Source code for search.matchers

"""
Matcher functions to determine similarity between two strings

Using different algorithms and given two strings, each function returns a
value between 0 and 1, where 0 is completely different and 1 represent complete
equality (as in "hello world", "hello world")
"""
from search import utils, config


# =====================================================================
# String similarity functions
# ---------------------------------------------------------------------
# A set of functions that calculate the edit distance between two
# strings

def simple_ratio(query, string):
    return utils.ratio(query, string)


def best_token_ratio(query, string):
    query = utils.sorted_unique_tokens(query)
    string = utils.sorted_unique_tokens(string)
    string = utils.stringify_tokens(string)

    prob = 0
    for segment in query:
        match = utils.best_partial_ratio(segment, string)
        prob = match if match > prob else prob
        if prob == 1:
            break
    return prob


[docs]def token_sort_ratio(query, string):
    """
    generate tokens from query and string, then for each query token
    find the best partial ratio on the string and get the average value
    """
    query_tokens = utils.sorted_unique_tokens(query)
    string_tokens = utils.sorted_unique_tokens(string)
    # generate and sort a set of strings, removing duplicates, then
    # join them together again as they are (no spaces or punctuation)
    tokenized_string = utils.stringify_tokens(string_tokens)

    matches = {}
    for q_token in query_tokens:
        # loop every token in the query, adding a default similarity == 0
        # to the matches dictionary
        matches[q_token] = 0
        # loop for every word in the searched string
        for token in utils.shifter(tokenized_string, len(q_token)):
            # and call the slider on each of them, getting the similariy
            # for every extracted token
            match = utils.ratio(q_token, token)
            if match > matches[q_token]:
                matches[q_token] = match
                if match == 1:
                    break

    return utils.average(matches.values())


[docs]def intersect_token_ratio(query, string):
    """
    Perform a match utilizing the intersection method.
    """
    query = utils.sorted_unique_tokens(query)
    string = utils.sorted_unique_tokens(string)
    common, diff_q, diff_s = utils.sorted_intersect(query, string)
    t0 = ''.join(common)            # common elements for query and string
    t1 = ''.join(common + diff_q)   # common plus the diff elements on query
    t2 = ''.join(common + diff_s)   # common plus diff elements on string
    best = 0
    for (q, s) in ((t0, t1), (t1, t2), (t0, t2)):
        match = utils.ratio(q, s)
        if match > best:
            best = match
            if best == 1:
                break

    return best

# =====================================================================
# Full matchers functions
# ---------------------------------------------------------------------
# A set of functions that perform various checks and matches agains
# a query and a string and return a comprehensive value representing
# the equality of the two strings


def lazy_match(query, string):

    query_tokens = utils.tokenize_set(query)
    string_tokens = utils.tokenize_set(string)
    shortest, longest = sorted((query_tokens, string_tokens))
    len_short, len_long = len(shortest), len(longest)

    # If the longest has no length it's useless to continue
    if len_long == 0:
        return 0

    if len_long == 1:
        # len_short == 1 too, so 1 word against 1 word
        return simple_ratio(query, string)
    if len_short == 1 and len_long < 4:
        # at most one word against a short string
        return best_token_ratio(query, string)

    if len_short < 3 and len_long < 5:
        # if the length of the short is enough, try with a
        return token_sort_ratio(query, string)

    else:
        # in any other condition, such as short query against long string
        # use intersect_ratio
        return intersect_token_ratio(query, string)


[docs]def similarity(query, string):
    """
    Calculate the match for the given `query` and `string`.

    The match is calculated using the `jaro winkler` for each set of the matrix
    (`query` x `string`) and takes into consideration the position difference
    into the strings.

    Arguments:
        query (str): search query
        string (str): string to test against

    Returns:
        float: normalized value indicating the probability of match, where
        0 means completely dissimilar and 1 means equal.
    """

    # split the two strings cleaning out some stuff
    query = utils.tokenize(query)
    string = utils.tokenize(string)

    # if one of the two strings is falsy (no content, or was passed with items
    # short enough to be trimmed out), return 0 here to avoid ZeroDivisionError
    # later on while processing.
    if len(query) == 0 or len(string) == 0:
        return 0

    short, long_ = sorted((query, string), key=lambda x: len(x))

    matches = {}
    # generate a matrix from the two strings and loop on every couple
    for string1, string2 in ((s1, s2) for s1 in long_ for s2 in short):
        # get the jaro winkler equality between the two strings
        match = utils.ratio(string1, string2)
        # calculate the distance factor for the position of the segments
        # on their respective lists
        positional = position_similarity(string1, string2, long_, short)

        # get them together and append to the matches dictionary
        matches.setdefault(string1, []).append((match, positional))

    # get the highest value for each list, the apply the word-distance factor
    # the key takes the jaro winkler distance value to get the max value
    matches = [max(m, key=lambda x: x[0]) for m in matches.values()]
    weights_ = utils.scale_to_one((config.MATCH_WEIGHT, config.DIST_WEIGHT))
    matches = [utils.weighted_average((m, d), weights_) for m, d in matches]

    # get the weighted mean for all the highest matches and apply the highest
    # match value found as coefficient as multiplier, to add weights to more
    # coherent matches.
    weights_ = list(range(len(matches), 0, -1))
    return utils.weighted_average(matches, weights_) * max(matches)


# =====================================================================
# Positional similarity


[docs]def position_similarity(el1, el2, seq1, seq2):
    """
    Get the normalized inverted movement cost for for between `el1` and
    `el2` on the `seq2` iterable.

    The function is used to get a value describing how far two words are in a
    phrase (as list, as in ``string.split(' ')`` or, in our case through
    :func:`search.utils.tokenize`).

    Moves are relative to el1 on seq1, which `should` be the longest set
    for the function to work properly.

    .. warning::
        The function is currently broken and always returns 1, making
        the position inside the matching string irrelevant.

    .. note:: The given strings **MUST** be inside the corresponding list.

    Arguments:
        el1 (any): element of the ``seq1`` iterable
        el2 (any): element of the ``seq2`` iterable
        seq1 (iterable): iterable allowing the ``index`` method containing
            the `el1` element.
        seq2 (iterable): iterable allowing the ``index`` method containing
            the `el2` element.

    Returns:
        float: value ``0 -> 1`` representing how far the two words are,
        where ``1`` represent the closest(same position) and tending to zero
        the farthest on the maximum available moves possible on ``seq1``.

    """
    return 1
    # FIXME: Something's wrong here.
    # Function left `on hold` with a return 1 since we need a value and 1
    # should be the default value for "everything is where is supposed to be"
    # -------------------------------------------------------------------------
    # The function SHOULD return a value representing a ratio of word (el1)
    # position inside the sequence that contains it (seq1) relative to the
    # other element (el2) on its sequence (seq2).
    # The reason for this function to exist is to have a value that tells the
    # rest of the search algoritm how much an element is offset relative to
    # the supposed position, allowing the search to give more weight to
    # queries that are written exactly as they are found, while i.e. inverted
    # order should have little less weight.
    # The idea is to have a 1 when:
    # * len(seq1) == len(seq2) AND indexes of elements are the same:
    #       'hi, hello there', 'yo, hello world' -> ('there', 'world')
    # * lengths are different BUT the el1 is proportionally at about the same
    #   position in seq1 as el2 is on el2:
    #       'hello world, 'hello there, how are you' -> ('world', 'are')
    #   meaning that the elements are more or less in the same "spot".
    # * one of the two sequences have length 1:
    #   should be irrelevant any type of calculus since the relative position
    #   of such word will be 0 (index/length) or the reciprocal will be 1
    #   (1 - idx/len)
    #
    # All other cases should return a value between 1 and ->0, tending toward
    # 0 the more the longest sequence is long.
    # =========================================================================

    long_, short = seq1, seq2
    el_long, el_short = el1, el2
    len_long, len_short = len(seq1), len(seq2)

    if len_long == 1 or len_short == 1:
        # useless to count as this would
        # * cause a ZeroDivisionError (no moves available)
        # * return 1 in any case since the algorithm NEEDS to return 1 if one
        #   of the two sequence is of len 1
        return 1

    if len_short > len_long:
        # Reverse if needed
        # NOTE: This should be ok and would prevent dumbness when setting up
        # the function call
        long_, short = short, long_
        el_long, el_short = el_short, el_long
        len_long, len_short = len_short, len_long

    long_idx = long_.index(el_long)
    short_idx = short.index(el_short)
    moves = abs(long_idx - short_idx)
    max_moves = utils.max_distance(long_, word_index=long_idx)

    return abs(1 - (moves / max_moves))