Update Matcher docstrings and API docs

This commit is contained in:
ines 2017-05-19 21:47:06 +02:00
parent c8580da686
commit fe5d8819ea
2 changed files with 110 additions and 183 deletions

View File

@ -159,14 +159,14 @@ def _convert_strings(token_specs, string_store):
def merge_phrase(matcher, doc, i, matches): def merge_phrase(matcher, doc, i, matches):
'''Callback to merge a phrase on match''' """Callback to merge a phrase on match."""
ent_id, label, start, end = matches[i] ent_id, label, start, end = matches[i]
span = doc[start : end] span = doc[start : end]
span.merge(ent_type=label, ent_id=ent_id) span.merge(ent_type=label, ent_id=ent_id)
cdef class Matcher: cdef class Matcher:
'''Match sequences of tokens, based on pattern rules.''' """Match sequences of tokens, based on pattern rules."""
cdef Pool mem cdef Pool mem
cdef vector[TokenPatternC*] patterns cdef vector[TokenPatternC*] patterns
cdef readonly Vocab vocab cdef readonly Vocab vocab
@ -175,37 +175,13 @@ cdef class Matcher:
cdef public object _callbacks cdef public object _callbacks
cdef public object _acceptors cdef public object _acceptors
@classmethod
def load(cls, path, vocab):
"""
Load the matcher and patterns from a file path.
Arguments:
path (Path):
Path to a JSON-formatted patterns file.
vocab (Vocab):
The vocabulary that the documents to match over will refer to.
Returns:
Matcher: The newly constructed object.
"""
if (path / 'gazetteer.json').exists():
with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
patterns = ujson.load(file_)
else:
patterns = {}
return cls(vocab, patterns)
def __init__(self, vocab, patterns={}): def __init__(self, vocab, patterns={}):
""" """Create the Matcher.
Create the Matcher.
Arguments: vocab (Vocab): The vocabulary object, which must be shared with the
vocab (Vocab): documents the matcher will operate on.
The vocabulary object, which must be shared with the documents
the matcher will operate on.
patterns (dict): Patterns to add to the matcher. patterns (dict): Patterns to add to the matcher.
Returns: RETURNS (Matcher): The newly constructed object.
The newly constructed object.
""" """
self._patterns = {} self._patterns = {}
self._entities = {} self._entities = {}
@ -226,22 +202,15 @@ cdef class Matcher:
def add_entity(self, entity_key, attrs=None, if_exists='raise', def add_entity(self, entity_key, attrs=None, if_exists='raise',
acceptor=None, on_match=None): acceptor=None, on_match=None):
""" # TODO: replace with new Matcher.add()
Add an entity to the matcher. """Add an entity to the matcher.
Arguments: entity_key (unicode or int): An ID for the entity.
entity_key (unicode or int): attrs (dict): Attributes to associate with the `Matcher`.
An ID for the entity. if_exists (unicode): `'raise'`, `'ignore'` or `'update'`. Controls what
attrs: happens if the entity ID already exists. Defaults to `'raise'`.
Attributes to associate with the Matcher. acceptor (function): Callback function to filter matches of the entity.
if_exists ('raise', 'ignore' or 'update'): on_match (function): Callback function to act on matches of the entity.
Controls what happens if the entity ID already exists. Defaults to 'raise'.
acceptor:
Callback function to filter matches of the entity.
on_match:
Callback function to act on matches of the entity.
Returns:
None
""" """
if if_exists not in ('raise', 'ignore', 'update'): if if_exists not in ('raise', 'ignore', 'update'):
raise ValueError( raise ValueError(
@ -264,18 +233,12 @@ cdef class Matcher:
self._callbacks[entity_key] = on_match self._callbacks[entity_key] = on_match
def add_pattern(self, entity_key, token_specs, label=""): def add_pattern(self, entity_key, token_specs, label=""):
""" # TODO: replace with new Matcher.add()
Add a pattern to the matcher. """Add a pattern to the matcher.
Arguments: entity_key (unicode or int): An ID for the entity.
entity_key (unicode or int): token_specs (list): Description of the pattern to be matched.
An ID for the entity. label (unicode): Label to assign to the matched pattern. Defaults to `""`.
token_specs:
Description of the pattern to be matched.
label:
Label to assign to the matched pattern. Defaults to "".
Returns:
None
""" """
token_specs = list(token_specs) token_specs = list(token_specs)
if len(token_specs) == 0: if len(token_specs) == 0:
@ -296,6 +259,7 @@ cdef class Matcher:
self._patterns[entity_key].append((label, token_specs)) self._patterns[entity_key].append((label, token_specs))
def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None): def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None):
# TODO: replace with new Matcher.add()
self.add_entity(entity_key, attrs=attrs, if_exists='update', self.add_entity(entity_key, attrs=attrs, if_exists='update',
acceptor=acceptor, on_match=on_match) acceptor=acceptor, on_match=on_match)
for spec in specs: for spec in specs:
@ -308,25 +272,21 @@ cdef class Matcher:
return entity_key return entity_key
def has_entity(self, entity_key): def has_entity(self, entity_key):
""" # TODO: deprecate
Check whether the matcher has an entity. """Check whether the matcher has an entity.
Arguments:
entity_key (unicode or int): The entity key to check. entity_key (unicode or int): The entity key to check.
Returns: RETURNS (bool): Whether the matcher has the entity.
bool: Whether the matcher has the entity.
""" """
entity_key = self.normalize_entity_key(entity_key) entity_key = self.normalize_entity_key(entity_key)
return entity_key in self._entities return entity_key in self._entities
def get_entity(self, entity_key): def get_entity(self, entity_key):
""" # TODO: deprecate
Retrieve the attributes stored for an entity. """Retrieve the attributes stored for an entity.
Arguments:
entity_key (unicode or int): The entity to retrieve. entity_key (unicode or int): The entity to retrieve.
Returns: RETURNS (dict): The entity attributes if present, otherwise None.
The entity attributes if present, otherwise None.
""" """
entity_key = self.normalize_entity_key(entity_key) entity_key = self.normalize_entity_key(entity_key)
if entity_key in self._entities: if entity_key in self._entities:
@ -335,17 +295,12 @@ cdef class Matcher:
return None return None
def __call__(self, Doc doc, acceptor=None): def __call__(self, Doc doc, acceptor=None):
""" """Find all token sequences matching the supplied patterns on the `Doc`.
Find all token sequences matching the supplied patterns on the Doc.
Arguments: doc (Doc): The document to match over.
doc (Doc): RETURNS (list): A list of `(entity_key, label_id, start, end)` tuples,
The document to match over. describing the matches. A match tuple describes a span
Returns: `doc[start:end]`. The `label_id` and `entity_key` are both integers.
list
A list of (entity_key, label_id, start, end) tuples,
describing the matches. A match tuple describes a span doc[start:end].
The label_id and entity_key are both integers.
""" """
if acceptor is not None: if acceptor is not None:
raise ValueError( raise ValueError(
@ -449,18 +404,13 @@ cdef class Matcher:
return matches return matches
def pipe(self, docs, batch_size=1000, n_threads=2): def pipe(self, docs, batch_size=1000, n_threads=2):
""" """Match a stream of documents, yielding them in turn.
Match a stream of documents, yielding them in turn.
Arguments: docs (iterable): A stream of documents.
docs: A stream of documents. batch_size (int): The number of documents to accumulate into a working set.
batch_size (int): n_threads (int): The number of threads with which to work on the buffer
The number of documents to accumulate into a working set. in parallel, if the `Matcher` implementation supports multi-threading.
n_threads (int): YIELDS (Doc): Documents, in order.
The number of threads with which to work on the buffer in parallel,
if the Matcher implementation supports multi-threading.
Yields:
Doc Documents, in order.
""" """
for doc in docs: for doc in docs:
self(doc) self(doc)

View File

@ -4,31 +4,26 @@ include ../../_includes/_mixins
p Match sequences of tokens, based on pattern rules. p Match sequences of tokens, based on pattern rules.
+h(2, "load") Matcher.load +infobox("⚠️ Deprecation note")
+tag classmethod | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
| are deprecated and have been replaced with a simpler
p Load the matcher and patterns from a file path. | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
| patterns and a callback for a given match ID.
+table(["Name", "Type", "Description"]) | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
+row | #[code Matcher.has_entity] and #[code Matcher.get_entity] (now redundant)
+cell #[code path] | have been removed.
+cell #[code Path]
+cell Path to a JSON-formatted patterns file.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocabulary that the documents to match over will refer to.
+footrow
+cell returns
+cell #[code Matcher]
+cell The newly constructed object.
+h(2, "init") Matcher.__init__ +h(2, "init") Matcher.__init__
+tag method +tag method
p Create the Matcher. p Create the rule-based #[code Matcher].
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER
patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]}
matcher = Matcher(nlp.vocab)
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -41,7 +36,7 @@ p Create the Matcher.
+row +row
+cell #[code patterns] +cell #[code patterns]
+cell dict +cell dict
+cell Patterns to add to the matcher. +cell Patterns to add to the matcher, keyed by ID.
+footrow +footrow
+cell returns +cell returns
@ -51,7 +46,28 @@ p Create the Matcher.
+h(2, "call") Matcher.__call__ +h(2, "call") Matcher.__call__
+tag method +tag method
p Find all token sequences matching the supplied patterns on the Doc. p Find all token sequences matching the supplied patterns on the #[code Doc].
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER
matcher = Matcher(nlp.vocab)
pattern = [{LOWER: "hello"}, {LOWER: "world"}]
matcher.add("HelloWorld", pattern, on_match=None)
doc = nlp(u'hello world!')
matches = matcher(doc)
+infobox("Important note")
| By default, the matcher #[strong does not perform any action] on matches,
| like tagging matched phrases with entity types. Instead, actions need to
| be specified when #[strong adding patterns or entities], by
| passing in a callback function as the #[code on_match] argument on
| #[+api("matcher#add") #[code add]]. This allows you to define custom
| actions per pattern within the same matcher. For example, you might only
| want to merge some entity types, and set custom flags for other matched
| patterns. For more details and examples, see the usage workflow on
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -76,7 +92,7 @@ p Match a stream of documents, yielding them in turn.
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code docs] +cell #[code docs]
+cell - +cell iterable
+cell A stream of documents. +cell A stream of documents.
+row +row
@ -97,83 +113,44 @@ p Match a stream of documents, yielding them in turn.
+cell #[code Doc] +cell #[code Doc]
+cell Documents, in order. +cell Documents, in order.
+h(2, "add_entity") Matcher.add_entity +h(2, "add") Matcher.add
+tag method +tag method
p Add an entity to the matcher. p
| Add one or more patterns to the matcher, along with a callback function
| to handle the matches. The callback function will receive the arguments
| #[code matcher], #[code doc], #[code id] and #[code matches].
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER, ORTH
def on_match(matcher, doc, id, matches):
print('Matched!', matches)
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', [{LOWER: "hello"}, {LOWER: "world"}], on_match=on_match)
matcher.add('GoogleMaps', [{ORTH: "Google"}, {ORTH: "Maps"}], on_match=on_match)
doc = nlp(u'HELLO WORLD on Google Maps.')
matches = matcher(doc)
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code entity_key] +cell #[code match_id]
+cell unicode / int
+cell An ID for the entity.
+row
+cell #[code attrs]
+cell -
+cell Attributes to associate with the Matcher.
+row
+cell #[code if_exists]
+cell unicode +cell unicode
+cell +cell An ID for the thing you're matching.
| #[code 'raise'], #[code 'ignore'] or #[code 'update']. Controls
| what happens if the entity ID already exists. Defaults to
| #[code 'raise'].
+row +row
+cell #[code acceptor] +cell #[code *patterns]
+cell - +cell list
+cell Callback function to filter matches of the entity. +cell
| Match pattern. A pattern consists of a list of dicts, where each
| dict describes a token.
+row +row
+cell #[code on_match] +cell #[code on_match]
+cell - +cell function
+cell Callback function to act on matches of the entity. +cell
| Callback function to act on matches. Takes the arguments
+footrow | #[code matcher], #[code doc], #[code id] and #[code matches].
+cell returns
+cell #[code None]
+cell -
+h(2, "add_pattern") Matcher.add_pattern
+tag method
p Add a pattern to the matcher.
+table(["Name", "Type", "Description"])
+row
+cell #[code entity_key]
+cell unicode / int
+cell An ID for the entity.
+row
+cell #[code token_specs]
+cell -
+cell Description of the pattern to be matched.
+row
+cell #[code label]
+cell unicode / int
+cell Label to assign to the matched pattern. Defaults to #[code ""].
+footrow
+cell returns
+cell #[code None]
+cell -
+h(2, "has_entity") Matcher.has_entity
+tag method
p Check whether the matcher has an entity.
+table(["Name", "Type", "Description"])
+row
+cell #[code entity_key]
+cell unicode / int
+cell The entity key to check.
+footrow
+cell returns
+cell bool
+cell Whether the matcher has the entity.