Source code for almanac.utils.fuzzy_matcher

"""Implementation of the ``FuzzyMatcher`` class."""

from collections import namedtuple
from difflib import SequenceMatcher
from operator import attrgetter
from typing import Iterable, Iterator, Tuple

FuzzResult = namedtuple('FuzzResult', ['string', 'ratio'])

_DEFAULT_FUZZ_RATIO_THRESHOLD = 0.6
_DEFAULT_MAX_MATCHES = 3


class FuzzyMatcher:
    """A simple fuzzy string matcher.

    Every candidate is scored against ``reference`` with
    :class:`difflib.SequenceMatcher`. Candidates whose ratio is strictly
    greater than ``ratio_threshold`` are kept, ordered from most to least
    similar, and truncated to at most ``num_max_matches`` entries.

    Args:
        reference: The string against which candidate matches are compared.
        candidates: The strings to compare against ``reference``.
        ratio_threshold: The float within the range ``0 < r < 1`` that valid
            matches must surpass when compared to ``reference``.
        num_max_matches: The maximum number of matches that this class will
            hold. Values below zero are treated as zero.

    """

    def __init__(
        self,
        reference: str,
        candidates: Iterable[str],
        ratio_threshold: float = _DEFAULT_FUZZ_RATIO_THRESHOLD,
        num_max_matches: int = _DEFAULT_MAX_MATCHES
    ) -> None:
        self._reference: str = reference

        # Look ``fuzz`` up on the runtime class so subclass overrides apply.
        fuzz = type(self).fuzz
        scored = (fuzz(reference, candidate) for candidate in candidates)
        passing = (result for result in scored if result.ratio > ratio_threshold)

        # Best matches first: truncating an ascending sort would keep the
        # *least* similar candidates. ``sorted`` is stable, so candidates with
        # equal ratios keep their input order.
        ranked = sorted(passing, key=attrgetter('ratio'), reverse=True)

        # Clamp the limit so a negative value cannot slice from the end.
        limit = max(num_max_matches, 0)
        self._results: Tuple[FuzzResult, ...] = tuple(ranked[:limit])
        self._matches: Tuple[str, ...] = tuple(r.string for r in self._results)

    @staticmethod
    def fuzz(
        reference: str,
        comparison: str
    ) -> FuzzResult:
        """Return the fuzz ratio of two strings. Higher means more similar.

        The returned :data:`FuzzResult` pairs ``comparison`` with its
        ``SequenceMatcher`` ratio against ``reference``.

        """
        ratio = SequenceMatcher(None, reference, comparison).ratio()
        return FuzzResult(comparison, ratio)

    @property
    def reference(
        self
    ) -> str:
        """The reference string against which candidates are matched."""
        return self._reference

    @property
    def matches(
        self
    ) -> Tuple[str, ...]:
        """Only the strings of matching candidates, best match first."""
        return self._matches

    @property
    def results(
        self
    ) -> Tuple[FuzzResult, ...]:
        """The `FuzzResult` result of each match, best match first."""
        return self._results

    def __bool__(
        self
    ) -> bool:
        # Truthy when at least one candidate passed the threshold.
        return bool(self._matches)

    def __iter__(
        self
    ) -> Iterator[str]:
        # Iterates over the matched strings, not the ``FuzzResult`` tuples.
        yield from self._matches

    def __contains__(
        self,
        s: str
    ) -> bool:
        # Membership is tested against the retained matches only.
        return s in self._matches