"""
Utility module for the search engine
Contains various functions to do common operations on strings and iterables,
such as normalization, tokenization, averages, splitting and walking through
the elements of the iterable/string.
"""
import re
from difflib import SequenceMatcher # noqa
from search import config
# =====================================================================
# Ratio functions
# ---------------------------------------------------------------------
# Various functions that calculates best/worst match ratio against two
# strings, either directly or by splitting/tokenizing them first
[docs]def ratio(query, string):
"""
Simple ratio between the whole strings. values 0 -> 1.
Should be good for lengths == 1
"""
return SequenceMatcher(None, query, string).ratio()
[docs]def best_partial_ratio(query, string):
"""
Best partial ratio between query and string.
`String` is walked with the shifter function and each segment's length
is equal to the `query` length:
"""
v = 0
for str_segment in shifter(string, len(query)):
m = ratio(query, str_segment)
v = m if m > v else m
if v == 1:
break
return v
# =====================================================================
# Tokenization and processing
# ---------------------------------------------------------------------
# Various functions that perform a tokenization of some sort on a
# string and return eiter a list or a set of tokens (substrings) that
# can be used for better matching.
# NOTE: When tokenizing be sure to use the same tokenizer function for
# both the query and each string to match against
[docs]def splitter(string, chunk_size):
"""
Generator function that returns chunks of `string` of size `chunk_size`.
If chunk_size is ``-1`` the whole string is returned.
Args:
string (str): the string to get the chunks from
chunk_size (int): max length of the chunks (last one can be shorter)
Yields:
one chunks of `string` of size `chunk_size` on each call such as
>>> splitter('hello, world', 3)
'hel', 'lo,', ' wo', 'rld'
"""
l = len(string)
if chunk_size <= 0:
# if requested size is less than zero return the whole string
chunk_size = l
if l == 0:
yield string
return
i = 0
while i < l:
n, i = i, i + chunk_size
yield string[n:i]
[docs]def shifter(string, chunk_size):
"""
Generator function that slides through a string and returns strings
of (max) length `chunk_size`, sliding one character at a time.
Args:
string (str): the string to walk
chunk_size (int): the size of each chunk
Yield:
one chunks of `string` of size `chunk_size` on each call such as
>>> splitter('hello, world', 3)
'hel', 'ell', 'llo', ...
"""
l = len(string)
if chunk_size <= 0:
# if requested size is less than zero return the whole string
chunk_size = l
if l == 0 or chunk_size >= l:
yield string
return
i = 0
stop = l - chunk_size
while i <= stop:
yield string[i:i + chunk_size]
i += 1
[docs]def tokenize(string,
regexp=config.STR_SPLIT_REGEX,
min_len=config.MIN_WORD_LENGTH):
"""
Given a string return a ``list`` of segments of the string,
splitted with :any:`config.STR_SPLIT_REGEX`, removing every word <
:any:`config.MIN_WORD_LENGTH`.
Arguments:
string (str): string to tokenize
regexp (regexp): regular expression to split the string with
min_len (int): minimum word length
Returns:
list - all the tokens extracted from the string in the same
order they were found.
Example:
>>> utils.tokenize('hello there, how are you')
['hello', 'there']
"""
return [
w for w in re.split(regexp, string.lower())
if len(w) > min_len and w not in config.STOP_WORDS
]
[docs]def tokenize_set(string,
regexp=config.STR_SPLIT_REGEX,
min_len=config.MIN_WORD_LENGTH):
"""
Given a string returns a ``set`` of unique segments of the strings (single
words) splitted with :any:`config.STR_SPLIT_REGEX`, removing every word <
:any:`config.MIN_WORD_LENGTH`.
The differences with :any:`tokenize` is that returns a set instead of a
list, so the values are unique.
Arguments:
string (str): string to tokenize
regexp (regexp): regular expression to split the string with
min_len (int): minimum word length
Returns:
set - all the tokens extracted from `string` in a set
Example:
>>> utils.tokenize('hello there, there are some fishes there!')
['hello', 'there']
"""
s = tokenize(string, regexp, min_len)
return set(s)
[docs]def sorted_unique_tokens(
string, regexp=config.STR_SPLIT_REGEX,
min_len=config.MIN_WORD_LENGTH):
"""
Return a sorted list of unique tokens contained in the string
"""
return sorted(tokenize_set(string, regexp, min_len))
[docs]def stringify_tokens(tokens):
"""
Given a list or set of tokens, return them as a single string
"""
return ''.join(tokens)
[docs]def sorted_intersect(query_tokens, string_tokens):
"""
return the sorted intersection and remainders of the two iterables
Arguments:
query_tokens(iterable): first elements group to intersect
string_tokens(iterable): second elements group to intersect
Returns:
tuple of lists:
* sorted intersected list - all values common to both iterables
* sorted remainders for `query_tokens`
* sorted remainders for `string_tokens`
Example:
>> > sorted_intersect([1, 3, 2, 4], [3, 4, 5])
[3, 4], [1, 2], [5]
"""
common = [
i for i in query_tokens for j in string_tokens
if ratio(i, j) >= config.THRESHOLD
]
rest_query = (el for el in query_tokens if el not in common)
rest_string = (el for el in string_tokens if el not in common)
return sorted(common), sorted(rest_query), sorted(rest_string)
# =====================================================================
# Normalization
# ---------------------------------------------------------------------
# Various normalization function, that given an iterable of values
# (numbers) return a list of normalized values.
# Normalization can be either scaled to 1 (higher value == 1) or
# that sums to 1 (sum of all normalized values == 1)
[docs]def normalize(iterable):
"""
Normalize an iterable of numbers in a series that sums up to 1.
Arguments:
iterable(numbers): an iterable containing numbers
(either `int` or `float`) to normalize.
Returns:
list: Normalized float values, in the same order as the one provided.
Example:
>> > normalize([3, 2, 1])
[0.5, 0.333333, 0.166667]
"""
tot = sum(iterable)
return [float(v) / tot for v in iterable]
[docs]def scale_to_one(iterable):
"""
Scale an iterable of numbers proportionally such as the highest number
equals to 1
Example:
>> > scale_to_one([5, 4, 3, 2, 1])
[1, 0.8, 0.6, 0.4, 0.2]
"""
m = max(iterable)
return [v / m for v in iterable]
# =====================================================================
# Averages and distances
# ---------------------------------------------------------------------
# Functions that allows calculating arithmetic average, weighted mean
# or list elements distances (such as max_distance that returns the
# max distance from the given index to the farthest end of the list)
[docs]def average(values):
"""Return the arithmetic average of the values."""
return sum(values) / len(values)
[docs]def weighted_average(values, weights=None):
"""Calculate the weighted mean average between two iterables of `values`
and matching `weights`. If weights is ``None`` they will be autogenerated
Args:
values(iterable): Values to average, either `int` or `float`
weights(iterable): Matching weights iterable
Returns:
float: weighted average
"""
if not weights:
weights = generate_weights(values)
if len(values) != len(weights):
raise ValueError(
'Values and Weights length do not match for weighted average')
val = sum(m * w for m, w in zip(values, weights))
weights = sum(weights)
return val / weights
[docs]def generate_weights(iterable):
"""
Generate normalized weights for the iterable elements in reversed order.
"""
weights = list(range(len(iterable), 0, -1))
return scale_to_one(weights)
[docs]def max_distance(sequence, idx):
"""
Given a list an int in range(len(sequence)), determine the maximum
amount of `movements` available from that index position in the list.
Arguments:
sequence(any with __len__): iterable to calculate the maximum moves
into. It's used only to get its `len`, so any object that
implements the `__len__` method will do.
idx(int): index to count from.
The value ** must ** be `>= 0` but ** can ** be over the length
of the list.
Returns:
int: maximum moves available in any direction.
Example:
>> > utils.get_max_moves(['a', 'b', 'c', 'd', 'e'], 1)
4 # index 1 -> 'b', so max is 4 moves right
>> > utils.get_max_moves(['a', 'b', 'c', 'd', 'e'], 6)
5 # can move left 5
"""
return max(idx, (len(sequence) - (idx + 1)))
# =====================================================================
# Debugging utilities
def _dec(fl):
"""Return a stringified more readable float, for debugging purposes."""
return '{:.2f}'.format(fl)