diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index c9084c359..76c6a6cc7 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -159,14 +159,14 @@ def _convert_strings(token_specs, string_store): def merge_phrase(matcher, doc, i, matches): - '''Callback to merge a phrase on match''' + """Callback to merge a phrase on match.""" ent_id, label, start, end = matches[i] span = doc[start : end] span.merge(ent_type=label, ent_id=ent_id) cdef class Matcher: - '''Match sequences of tokens, based on pattern rules.''' + """Match sequences of tokens, based on pattern rules.""" cdef Pool mem cdef vector[TokenPatternC*] patterns cdef readonly Vocab vocab @@ -175,37 +175,13 @@ cdef class Matcher: cdef public object _callbacks cdef public object _acceptors - @classmethod - def load(cls, path, vocab): - """ - Load the matcher and patterns from a file path. - - Arguments: - path (Path): - Path to a JSON-formatted patterns file. - vocab (Vocab): - The vocabulary that the documents to match over will refer to. - Returns: - Matcher: The newly constructed object. - """ - if (path / 'gazetteer.json').exists(): - with (path / 'gazetteer.json').open('r', encoding='utf8') as file_: - patterns = ujson.load(file_) - else: - patterns = {} - return cls(vocab, patterns) - def __init__(self, vocab, patterns={}): - """ - Create the Matcher. + """Create the Matcher. - Arguments: - vocab (Vocab): - The vocabulary object, which must be shared with the documents - the matcher will operate on. - patterns (dict): Patterns to add to the matcher. - Returns: - The newly constructed object. + vocab (Vocab): The vocabulary object, which must be shared with the + documents the matcher will operate on. + patterns (dict): Patterns to add to the matcher. + RETURNS (Matcher): The newly constructed object. 
""" self._patterns = {} self._entities = {} @@ -226,22 +202,15 @@ cdef class Matcher: def add_entity(self, entity_key, attrs=None, if_exists='raise', acceptor=None, on_match=None): - """ - Add an entity to the matcher. + # TODO: replace with new Matcher.add() + """Add an entity to the matcher. - Arguments: - entity_key (unicode or int): - An ID for the entity. - attrs: - Attributes to associate with the Matcher. - if_exists ('raise', 'ignore' or 'update'): - Controls what happens if the entity ID already exists. Defaults to 'raise'. - acceptor: - Callback function to filter matches of the entity. - on_match: - Callback function to act on matches of the entity. - Returns: - None + entity_key (unicode or int): An ID for the entity. + attrs (dict): Attributes to associate with the `Matcher`. + if_exists (unicode): `'raise'`, `'ignore'` or `'update'`. Controls what + happens if the entity ID already exists. Defaults to `'raise'`. + acceptor (function): Callback function to filter matches of the entity. + on_match (function): Callback function to act on matches of the entity. """ if if_exists not in ('raise', 'ignore', 'update'): raise ValueError( @@ -264,18 +233,12 @@ cdef class Matcher: self._callbacks[entity_key] = on_match def add_pattern(self, entity_key, token_specs, label=""): - """ - Add a pattern to the matcher. + # TODO: replace with new Matcher.add() + """Add a pattern to the matcher. - Arguments: - entity_key (unicode or int): - An ID for the entity. - token_specs: - Description of the pattern to be matched. - label: - Label to assign to the matched pattern. Defaults to "". - Returns: - None + entity_key (unicode): An ID for the entity. + token_specs (list): Description of the pattern to be matched. + label (unicode): Label to assign to the matched pattern. Defaults to `""`. 
""" token_specs = list(token_specs) if len(token_specs) == 0: @@ -296,6 +259,7 @@ cdef class Matcher: self._patterns[entity_key].append((label, token_specs)) def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None): + # TODO: replace with new Matcher.add() self.add_entity(entity_key, attrs=attrs, if_exists='update', acceptor=acceptor, on_match=on_match) for spec in specs: @@ -308,25 +272,21 @@ cdef class Matcher: return entity_key def has_entity(self, entity_key): - """ - Check whether the matcher has an entity. + # TODO: deprecate + """Check whether the matcher has an entity. - Arguments: - entity_key (string or int): The entity key to check. - Returns: - bool: Whether the matcher has the entity. + entity_key (string or int): The entity key to check. + RETURNS (bool): Whether the matcher has the entity. """ entity_key = self.normalize_entity_key(entity_key) return entity_key in self._entities def get_entity(self, entity_key): - """ - Retrieve the attributes stored for an entity. + # TODO: deprecate + """Retrieve the attributes stored for an entity. - Arguments: - entity_key (unicode or int): The entity to retrieve. - Returns: - The entity attributes if present, otherwise None. + entity_key (unicode or int): The entity to retrieve. + RETURNS (dict): The entity attributes if present, otherwise None. """ entity_key = self.normalize_entity_key(entity_key) if entity_key in self._entities: @@ -335,17 +295,12 @@ cdef class Matcher: return None def __call__(self, Doc doc, acceptor=None): - """ - Find all token sequences matching the supplied patterns on the Doc. + """Find all token sequences matching the supplied patterns on the `Doc`. - Arguments: - doc (Doc): - The document to match over. - Returns: - list - A list of (entity_key, label_id, start, end) tuples, - describing the matches. A match tuple describes a span doc[start:end]. - The label_id and entity_key are both integers. + doc (Doc): The document to match over. 
+ RETURNS (list): A list of `(entity_key, label_id, start, end)` tuples, + describing the matches. A match tuple describes a span + `doc[start:end]`. The `label_id` and `entity_key` are both integers. """ if acceptor is not None: raise ValueError( @@ -449,18 +404,13 @@ cdef class Matcher: return matches def pipe(self, docs, batch_size=1000, n_threads=2): - """ - Match a stream of documents, yielding them in turn. + """Match a stream of documents, yielding them in turn. - Arguments: - docs: A stream of documents. - batch_size (int): - The number of documents to accumulate into a working set. - n_threads (int): - The number of threads with which to work on the buffer in parallel, - if the Matcher implementation supports multi-threading. - Yields: - Doc Documents, in order. + docs (iterable): A stream of documents. + batch_size (int): The number of documents to accumulate into a working set. + n_threads (int): The number of threads with which to work on the buffer + in parallel, if the `Matcher` implementation supports multi-threading. + YIELDS (Doc): Documents, in order. """ for doc in docs: self(doc) diff --git a/website/docs/api/matcher.jade b/website/docs/api/matcher.jade index 630c10df2..bfdd63813 100644 --- a/website/docs/api/matcher.jade +++ b/website/docs/api/matcher.jade @@ -4,31 +4,26 @@ include ../../_includes/_mixins p Match sequences of tokens, based on pattern rules. -+h(2, "load") Matcher.load - +tag classmethod - -p Load the matcher and patterns from a file path. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code path] - +cell #[code Path] - +cell Path to a JSON-formatted patterns file. - - +row - +cell #[code vocab] - +cell #[code Vocab] - +cell The vocabulary that the documents to match over will refer to. - - +footrow - +cell returns - +cell #[code Matcher] - +cell The newly constructed object. 
++infobox("⚠️ Deprecation note") + | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity] + | are deprecated and have been replaced with a simpler + | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of + | patterns and a callback for a given match ID. + | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks), + | #[code Matcher.has_entity] and #[code Matcher.get_entity] (now redundant) + | have been removed. +h(2, "init") Matcher.__init__ +tag method -p Create the Matcher. +p Create the rule-based #[code Matcher]. + ++aside-code("Example"). + from spacy.matcher import Matcher + from spacy.attrs import LOWER + + patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]} + matcher = Matcher(nlp.vocab) +table(["Name", "Type", "Description"]) +row @@ -41,7 +36,7 @@ p Create the Matcher. +row +cell #[code patterns] +cell dict - +cell Patterns to add to the matcher. + +cell Patterns to add to the matcher, keyed by ID. +footrow +cell returns @@ -51,7 +46,28 @@ p Create the Matcher. +h(2, "call") Matcher.__call__ +tag method -p Find all token sequences matching the supplied patterns on the Doc. +p Find all token sequences matching the supplied patterns on the #[code Doc]. + ++aside-code("Example"). + from spacy.matcher import Matcher + from spacy.attrs import LOWER + + matcher = Matcher(nlp.vocab) + pattern = [{LOWER: "hello"}, {LOWER: "world"}] + matcher.add("HelloWorld", pattern, on_match=None) + doc = nlp(u'hello world!') + matches = matcher(doc) + ++infobox("Important note") + | By default, the matcher #[strong does not perform any action] on matches, + | like tagging matched phrases with entity types. Instead, actions need to + | be specified when #[strong adding patterns or entities], by + | passing in a callback function as the #[code on_match] argument on + | #[+api("matcher#add") #[code add]]. This allows you to define custom + | actions per pattern within the same matcher. 
For example, you might only + | want to merge some entity types, and set custom flags for other matched + | patterns. For more details and examples, see the usage workflow on + | #[+a("/docs/usage/rule-based-matching") rule-based matching]. +table(["Name", "Type", "Description"]) +row @@ -76,7 +92,7 @@ p Match a stream of documents, yielding them in turn. +table(["Name", "Type", "Description"]) +row +cell #[code docs] - +cell - + +cell iterable +cell A stream of documents. +row @@ -97,83 +113,44 @@ p Match a stream of documents, yielding them in turn. +cell #[code Doc] +cell Documents, in order. -+h(2, "add_entity") Matcher.add_entity ++h(2, "add") Matcher.add +tag method -p Add an entity to the matcher. +p + | Add one or more patterns to the matcher, along with a callback function + | to handle the matches. The callback function will receive the arguments + | #[code matcher], #[code doc], #[code id] and #[code matches]. + ++aside-code("Example"). + from spacy.matcher import Matcher + from spacy.attrs import LOWER, ORTH + + def on_match(matcher, doc, id, matches): + print('Matched!', matches) + + matcher = Matcher(nlp.vocab) + matcher.add('HelloWorld', [{LOWER: "hello"}, {LOWER: "world"}], on_match=on_match) + matcher.add('GoogleMaps', [{ORTH: "Google"}, {ORTH: "Maps"}], on_match=on_match) + + doc = nlp(u'HELLO WORLD on Google Maps.') + matches = matcher(doc) +table(["Name", "Type", "Description"]) +row - +cell #[code entity_key] - +cell unicode / int - +cell An ID for the entity. - - +row - +cell #[code attrs] - +cell - - +cell Attributes to associate with the Matcher. - - +row - +cell #[code if_exists] + +cell #[code match_id] +cell unicode - +cell - | #[code 'raise'], #[code 'ignore'] or #[code 'update']. Controls - | what happens if the entity ID already exists. Defaults to - | #[code 'raise']. + +cell An ID for the thing you're matching. +row - +cell #[code acceptor] - +cell - - +cell Callback function to filter matches of the entity. 
+ +cell #[code *patterns] + +cell list + +cell + | Match pattern. A pattern consists of a list of dicts, where each + | dict describes a token. +row +cell #[code on_match] - +cell - - +cell Callback function to act on matches of the entity. - - +footrow - +cell returns - +cell #[code None] - +cell - - -+h(2, "add_pattern") Matcher.add_pattern - +tag method - -p Add a pattern to the matcher. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code entity_key] - +cell unicode / int - +cell An ID for the entity. - - +row - +cell #[code token_specs] - +cell - - +cell Description of the pattern to be matched. - - +row - +cell #[code label] - +cell unicode / int - +cell Label to assign to the matched pattern. Defaults to #[code ""]. - - +footrow - +cell returns - +cell #[code None] - +cell - - -+h(2, "has_entity") Matcher.has_entity - +tag method - -p Check whether the matcher has an entity. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code entity_key] - +cell unicode / int - +cell The entity key to check. - - +footrow - +cell returns - +cell bool - +cell Whether the matcher has the entity. + +cell function + +cell + | Callback function to act on matches. Takes the arguments + | #[code matcher], #[code doc], #[code id] and #[code matches].