Source code for search.utils

"""
Utility module for the search engine

Contains various functions to do common operations on strings and iterables,
such as normalization, tokenization, averages, splitting and walking through
the elements of the iterable/string.
"""
import re
from difflib import SequenceMatcher  # noqa

from search import config


# =====================================================================
# Ratio functions
# ---------------------------------------------------------------------
# Various functions that calculates best/worst match ratio against two
# strings, either directly or by splitting/tokenizing them first


[docs]def ratio(query, string): """ Simple ratio between the whole strings. values 0 -> 1. Should be good for lengths == 1 """ return SequenceMatcher(None, query, string).ratio()
[docs]def best_partial_ratio(query, string): """ Best partial ratio between query and string. `String` is walked with the shifter function and each segment's length is equal to the `query` length: """ v = 0 for str_segment in shifter(string, len(query)): m = ratio(query, str_segment) v = m if m > v else m if v == 1: break return v
# ===================================================================== # Tokenization and processing # --------------------------------------------------------------------- # Various functions that perform a tokenization of some sort on a # string and return eiter a list or a set of tokens (substrings) that # can be used for better matching. # NOTE: When tokenizing be sure to use the same tokenizer function for # both the query and each string to match against
[docs]def splitter(string, chunk_size): """ Generator function that returns chunks of `string` of size `chunk_size`. If chunk_size is ``-1`` the whole string is returned. Args: string (str): the string to get the chunks from chunk_size (int): max length of the chunks (last one can be shorter) Yields: one chunks of `string` of size `chunk_size` on each call such as >>> splitter('hello, world', 3) 'hel', 'lo,', ' wo', 'rld' """ l = len(string) if chunk_size <= 0: # if requested size is less than zero return the whole string chunk_size = l if l == 0: yield string return i = 0 while i < l: n, i = i, i + chunk_size yield string[n:i]
[docs]def shifter(string, chunk_size): """ Generator function that slides through a string and returns strings of (max) length `chunk_size`, sliding one character at a time. Args: string (str): the string to walk chunk_size (int): the size of each chunk Yield: one chunks of `string` of size `chunk_size` on each call such as >>> splitter('hello, world', 3) 'hel', 'ell', 'llo', ... """ l = len(string) if chunk_size <= 0: # if requested size is less than zero return the whole string chunk_size = l if l == 0 or chunk_size >= l: yield string return i = 0 stop = l - chunk_size while i <= stop: yield string[i:i + chunk_size] i += 1
[docs]def tokenize(string, regexp=config.STR_SPLIT_REGEX, min_len=config.MIN_WORD_LENGTH): """ Given a string return a ``list`` of segments of the string, splitted with :any:`config.STR_SPLIT_REGEX`, removing every word < :any:`config.MIN_WORD_LENGTH`. Arguments: string (str): string to tokenize regexp (regexp): regular expression to split the string with min_len (int): minimum word length Returns: list - all the tokens extracted from the string in the same order they were found. Example: >>> utils.tokenize('hello there, how are you') ['hello', 'there'] """ return [ w for w in re.split(regexp, string.lower()) if len(w) > min_len and w not in config.STOP_WORDS ]
[docs]def tokenize_set(string, regexp=config.STR_SPLIT_REGEX, min_len=config.MIN_WORD_LENGTH): """ Given a string returns a ``set`` of unique segments of the strings (single words) splitted with :any:`config.STR_SPLIT_REGEX`, removing every word < :any:`config.MIN_WORD_LENGTH`. The differences with :any:`tokenize` is that returns a set instead of a list, so the values are unique. Arguments: string (str): string to tokenize regexp (regexp): regular expression to split the string with min_len (int): minimum word length Returns: set - all the tokens extracted from `string` in a set Example: >>> utils.tokenize('hello there, there are some fishes there!') ['hello', 'there'] """ s = tokenize(string, regexp, min_len) return set(s)
[docs]def sorted_unique_tokens( string, regexp=config.STR_SPLIT_REGEX, min_len=config.MIN_WORD_LENGTH): """ Return a sorted list of unique tokens contained in the string """ return sorted(tokenize_set(string, regexp, min_len))
[docs]def stringify_tokens(tokens): """ Given a list or set of tokens, return them as a single string """ return ''.join(tokens)
[docs]def sorted_intersect(query_tokens, string_tokens): """ return the sorted intersection and remainders of the two iterables Arguments: query_tokens(iterable): first elements group to intersect string_tokens(iterable): second elements group to intersect Returns: tuple of lists: * sorted intersected list - all values common to both iterables * sorted remainders for `query_tokens` * sorted remainders for `string_tokens` Example: >> > sorted_intersect([1, 3, 2, 4], [3, 4, 5]) [3, 4], [1, 2], [5] """ common = [ i for i in query_tokens for j in string_tokens if ratio(i, j) >= config.THRESHOLD ] rest_query = (el for el in query_tokens if el not in common) rest_string = (el for el in string_tokens if el not in common) return sorted(common), sorted(rest_query), sorted(rest_string)
# ===================================================================== # Normalization # --------------------------------------------------------------------- # Various normalization function, that given an iterable of values # (numbers) return a list of normalized values. # Normalization can be either scaled to 1 (higher value == 1) or # that sums to 1 (sum of all normalized values == 1)
[docs]def normalize(iterable): """ Normalize an iterable of numbers in a series that sums up to 1. Arguments: iterable(numbers): an iterable containing numbers (either `int` or `float`) to normalize. Returns: list: Normalized float values, in the same order as the one provided. Example: >> > normalize([3, 2, 1]) [0.5, 0.333333, 0.166667] """ tot = sum(iterable) return [float(v) / tot for v in iterable]
[docs]def scale_to_one(iterable): """ Scale an iterable of numbers proportionally such as the highest number equals to 1 Example: >> > scale_to_one([5, 4, 3, 2, 1]) [1, 0.8, 0.6, 0.4, 0.2] """ m = max(iterable) return [v / m for v in iterable]
# ===================================================================== # Averages and distances # --------------------------------------------------------------------- # Functions that allows calculating arithmetic average, weighted mean # or list elements distances (such as max_distance that returns the # max distance from the given index to the farthest end of the list)
[docs]def average(values): """Return the arithmetic average of the values.""" return sum(values) / len(values)
[docs]def weighted_average(values, weights=None): """Calculate the weighted mean average between two iterables of `values` and matching `weights`. If weights is ``None`` they will be autogenerated Args: values(iterable): Values to average, either `int` or `float` weights(iterable): Matching weights iterable Returns: float: weighted average """ if not weights: weights = generate_weights(values) if len(values) != len(weights): raise ValueError( 'Values and Weights length do not match for weighted average') val = sum(m * w for m, w in zip(values, weights)) weights = sum(weights) return val / weights
[docs]def generate_weights(iterable): """ Generate normalized weights for the iterable elements in reversed order. """ weights = list(range(len(iterable), 0, -1)) return scale_to_one(weights)
[docs]def max_distance(sequence, idx): """ Given a list an int in range(len(sequence)), determine the maximum amount of `movements` available from that index position in the list. Arguments: sequence(any with __len__): iterable to calculate the maximum moves into. It's used only to get its `len`, so any object that implements the `__len__` method will do. idx(int): index to count from. The value ** must ** be `>= 0` but ** can ** be over the length of the list. Returns: int: maximum moves available in any direction. Example: >> > utils.get_max_moves(['a', 'b', 'c', 'd', 'e'], 1) 4 # index 1 -> 'b', so max is 4 moves right >> > utils.get_max_moves(['a', 'b', 'c', 'd', 'e'], 6) 5 # can move left 5 """ return max(idx, (len(sequence) - (idx + 1)))
# ===================================================================== # Debugging utilities def _dec(fl): """Return a stringified more readable float, for debugging purposes.""" return '{:.2f}'.format(fl)