Source code for search.matchers

"""
Matcher functions to determine similarity between two strings

Using different algorithms and given two strings, each function returns a
value between 0 and 1, where 0 is completely different and 1 represent complete
equality (as in "hello world", "hello world")
"""
from search import utils, config


# =====================================================================
# String similarity functions
# ---------------------------------------------------------------------
# A set of functions that calculate the edit distance between two
# strings

def simple_ratio(query, string):
    return utils.ratio(query, string)


def best_token_ratio(query, string):
    query = utils.sorted_unique_tokens(query)
    string = utils.sorted_unique_tokens(string)
    string = utils.stringify_tokens(string)

    prob = 0
    for segment in query:
        match = utils.best_partial_ratio(segment, string)
        prob = match if match > prob else prob
        if prob == 1:
            break
    return prob


[docs]def token_sort_ratio(query, string): """ generate tokens from query and string, then for each query token find the best partial ratio on the string and get the average value """ query_tokens = utils.sorted_unique_tokens(query) string_tokens = utils.sorted_unique_tokens(string) # generate and sort a set of strings, removing duplicates, then # join them together again as they are (no spaces or punctuation) tokenized_string = utils.stringify_tokens(string_tokens) matches = {} for q_token in query_tokens: # loop every token in the query, adding a default similarity == 0 # to the matches dictionary matches[q_token] = 0 # loop for every word in the searched string for token in utils.shifter(tokenized_string, len(q_token)): # and call the slider on each of them, getting the similariy # for every extracted token match = utils.ratio(q_token, token) if match > matches[q_token]: matches[q_token] = match if match == 1: break return utils.average(matches.values())
[docs]def intersect_token_ratio(query, string): """ Perform a match utilizing the intersection method. """ query = utils.sorted_unique_tokens(query) string = utils.sorted_unique_tokens(string) common, diff_q, diff_s = utils.sorted_intersect(query, string) t0 = ''.join(common) # common elements for query and string t1 = ''.join(common + diff_q) # common plus the diff elements on query t2 = ''.join(common + diff_s) # common plus diff elements on string best = 0 for (q, s) in ((t0, t1), (t1, t2), (t0, t2)): match = utils.ratio(q, s) if match > best: best = match if best == 1: break return best
# ===================================================================== # Full matchers functions # --------------------------------------------------------------------- # A set of functions that perform various checks and matches agains # a query and a string and return a comprehensive value representing # the equality of the two strings def lazy_match(query, string): query_tokens = utils.tokenize_set(query) string_tokens = utils.tokenize_set(string) shortest, longest = sorted((query_tokens, string_tokens)) len_short, len_long = len(shortest), len(longest) # If the longest has no length it's useless to continue if len_long == 0: return 0 if len_long == 1: # len_short == 1 too, so 1 word against 1 word return simple_ratio(query, string) if len_short == 1 and len_long < 4: # at most one word against a short string return best_token_ratio(query, string) if len_short < 3 and len_long < 5: # if the length of the short is enough, try with a return token_sort_ratio(query, string) else: # in any other condition, such as short query against long string # use intersect_ratio return intersect_token_ratio(query, string)
[docs]def similarity(query, string): """ Calculate the match for the given `query` and `string`. The match is calculated using the `jaro winkler` for each set of the matrix (`query` x `string`) and takes into consideration the position difference into the strings. Arguments: query (str): search query string (str): string to test against Returns: float: normalized value indicating the probability of match, where 0 means completely dissimilar and 1 means equal. """ # split the two strings cleaning out some stuff query = utils.tokenize(query) string = utils.tokenize(string) # if one of the two strings is falsy (no content, or was passed with items # short enough to be trimmed out), return 0 here to avoid ZeroDivisionError # later on while processing. if len(query) == 0 or len(string) == 0: return 0 short, long_ = sorted((query, string), key=lambda x: len(x)) matches = {} # generate a matrix from the two strings and loop on every couple for string1, string2 in ((s1, s2) for s1 in long_ for s2 in short): # get the jaro winkler equality between the two strings match = utils.ratio(string1, string2) # calculate the distance factor for the position of the segments # on their respective lists positional = position_similarity(string1, string2, long_, short) # get them together and append to the matches dictionary matches.setdefault(string1, []).append((match, positional)) # get the highest value for each list, the apply the word-distance factor # the key takes the jaro winkler distance value to get the max value matches = [max(m, key=lambda x: x[0]) for m in matches.values()] weights_ = utils.scale_to_one((config.MATCH_WEIGHT, config.DIST_WEIGHT)) matches = [utils.weighted_average((m, d), weights_) for m, d in matches] # get the weighted mean for all the highest matches and apply the highest # match value found as coefficient as multiplier, to add weights to more # coherent matches. weights_ = list(range(len(matches), 0, -1)) return utils.weighted_average(matches, weights_) * max(matches)
# ===================================================================== # Positional similarity
[docs]def position_similarity(el1, el2, seq1, seq2): """ Get the normalized inverted movement cost for for between `el1` and `el2` on the `seq2` iterable. The function is used to get a value describing how far two words are in a phrase (as list, as in ``string.split(' ')`` or, in our case through :func:`search.utils.tokenize`). Moves are relative to el1 on seq1, which `should` be the longest set for the function to work properly. .. warning:: The function is currently broken and always returns 1, making the position inside the matching string irrelevant. .. note:: The given strings **MUST** be inside the corresponding list. Arguments: el1 (any): element of the ``seq1`` iterable el2 (any): element of the ``seq2`` iterable seq1 (iterable): iterable allowing the ``index`` method containing the `el1` element. seq2 (iterable): iterable allowing the ``index`` method containing the `el2` element. Returns: float: value ``0 -> 1`` representing how far the two words are, where ``1`` represent the closest(same position) and tending to zero the farthest on the maximum available moves possible on ``seq1``. """ return 1 # FIXME: Something's wrong here. # Function left `on hold` with a return 1 since we need a value and 1 # should be the default value for "everything is where is supposed to be" # ------------------------------------------------------------------------- # The function SHOULD return a value representing a ratio of word (el1) # position inside the sequence that contains it (seq1) relative to the # other element (el2) on its sequence (seq2). # The reason for this function to exist is to have a value that tells the # rest of the search algoritm how much an element is offset relative to # the supposed position, allowing the search to give more weight to # queries that are written exactly as they are found, while i.e. inverted # order should have little less weight. # The idea is to have a 1 when: # * len(seq1) == len(seq2) AND indexes of elements are the same: # 'hi, hello there', 'yo, hello world' -> ('there', 'world') # * lengths are different BUT the el1 is proportionally at about the same # position in seq1 as el2 is on el2: # 'hello world, 'hello there, how are you' -> ('world', 'are') # meaning that the elements are more or less in the same "spot". # * one of the two sequences have length 1: # should be irrelevant any type of calculus since the relative position # of such word will be 0 (index/length) or the reciprocal will be 1 # (1 - idx/len) # # All other cases should return a value between 1 and ->0, tending toward # 0 the more the longest sequence is long. # ========================================================================= long_, short = seq1, seq2 el_long, el_short = el1, el2 len_long, len_short = len(seq1), len(seq2) if len_long == 1 or len_short == 1: # useless to count as this would # * cause a ZeroDivisionError (no moves available) # * return 1 in any case since the algorithm NEEDS to return 1 if one # of the two sequence is of len 1 return 1 if len_short > len_long: # Reverse if needed # NOTE: This should be ok and would prevent dumbness when setting up # the function call long_, short = short, long_ el_long, el_short = el_short, el_long len_long, len_short = len_short, len_long long_idx = long_.index(el_long) short_idx = short.index(el_short) moves = abs(long_idx - short_idx) max_moves = utils.max_distance(long_, word_index=long_idx) return abs(1 - (moves / max_moves))