mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Update Matcher docstrings and API docs
This commit is contained in:
parent
c8580da686
commit
fe5d8819ea
|
@ -159,14 +159,14 @@ def _convert_strings(token_specs, string_store):
|
||||||
|
|
||||||
|
|
||||||
def merge_phrase(matcher, doc, i, matches):
|
def merge_phrase(matcher, doc, i, matches):
|
||||||
'''Callback to merge a phrase on match'''
|
"""Callback to merge a phrase on match."""
|
||||||
ent_id, label, start, end = matches[i]
|
ent_id, label, start, end = matches[i]
|
||||||
span = doc[start : end]
|
span = doc[start : end]
|
||||||
span.merge(ent_type=label, ent_id=ent_id)
|
span.merge(ent_type=label, ent_id=ent_id)
|
||||||
|
|
||||||
|
|
||||||
cdef class Matcher:
|
cdef class Matcher:
|
||||||
'''Match sequences of tokens, based on pattern rules.'''
|
"""Match sequences of tokens, based on pattern rules."""
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef vector[TokenPatternC*] patterns
|
cdef vector[TokenPatternC*] patterns
|
||||||
cdef readonly Vocab vocab
|
cdef readonly Vocab vocab
|
||||||
|
@ -175,37 +175,13 @@ cdef class Matcher:
|
||||||
cdef public object _callbacks
|
cdef public object _callbacks
|
||||||
cdef public object _acceptors
|
cdef public object _acceptors
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def load(cls, path, vocab):
|
|
||||||
"""
|
|
||||||
Load the matcher and patterns from a file path.
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
path (Path):
|
|
||||||
Path to a JSON-formatted patterns file.
|
|
||||||
vocab (Vocab):
|
|
||||||
The vocabulary that the documents to match over will refer to.
|
|
||||||
Returns:
|
|
||||||
Matcher: The newly constructed object.
|
|
||||||
"""
|
|
||||||
if (path / 'gazetteer.json').exists():
|
|
||||||
with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
|
|
||||||
patterns = ujson.load(file_)
|
|
||||||
else:
|
|
||||||
patterns = {}
|
|
||||||
return cls(vocab, patterns)
|
|
||||||
|
|
||||||
def __init__(self, vocab, patterns={}):
|
def __init__(self, vocab, patterns={}):
|
||||||
"""
|
"""Create the Matcher.
|
||||||
Create the Matcher.
|
|
||||||
|
|
||||||
Arguments:
|
vocab (Vocab): The vocabulary object, which must be shared with the
|
||||||
vocab (Vocab):
|
documents the matcher will operate on.
|
||||||
The vocabulary object, which must be shared with the documents
|
patterns (dict): Patterns to add to the matcher.
|
||||||
the matcher will operate on.
|
RETURNS (Matcher): The newly constructed object.
|
||||||
patterns (dict): Patterns to add to the matcher.
|
|
||||||
Returns:
|
|
||||||
The newly constructed object.
|
|
||||||
"""
|
"""
|
||||||
self._patterns = {}
|
self._patterns = {}
|
||||||
self._entities = {}
|
self._entities = {}
|
||||||
|
@ -226,22 +202,15 @@ cdef class Matcher:
|
||||||
|
|
||||||
def add_entity(self, entity_key, attrs=None, if_exists='raise',
|
def add_entity(self, entity_key, attrs=None, if_exists='raise',
|
||||||
acceptor=None, on_match=None):
|
acceptor=None, on_match=None):
|
||||||
"""
|
# TODO: replace with new Matcher.add()
|
||||||
Add an entity to the matcher.
|
"""Add an entity to the matcher.
|
||||||
|
|
||||||
Arguments:
|
entity_key (unicode or int): An ID for the entity.
|
||||||
entity_key (unicode or int):
|
attrs (dict): Attributes to associate with the `Matcher`.
|
||||||
An ID for the entity.
|
if_exists (unicode): `'raise'`, `'ignore'` or `'update'`. Controls what
|
||||||
attrs:
|
happens if the entity ID already exists. Defaults to `'raise'`.
|
||||||
Attributes to associate with the Matcher.
|
acceptor (function): Callback function to filter matches of the entity.
|
||||||
if_exists ('raise', 'ignore' or 'update'):
|
on_match (function): Callback function to act on matches of the entity.
|
||||||
Controls what happens if the entity ID already exists. Defaults to 'raise'.
|
|
||||||
acceptor:
|
|
||||||
Callback function to filter matches of the entity.
|
|
||||||
on_match:
|
|
||||||
Callback function to act on matches of the entity.
|
|
||||||
Returns:
|
|
||||||
None
|
|
||||||
"""
|
"""
|
||||||
if if_exists not in ('raise', 'ignore', 'update'):
|
if if_exists not in ('raise', 'ignore', 'update'):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -264,18 +233,12 @@ cdef class Matcher:
|
||||||
self._callbacks[entity_key] = on_match
|
self._callbacks[entity_key] = on_match
|
||||||
|
|
||||||
def add_pattern(self, entity_key, token_specs, label=""):
|
def add_pattern(self, entity_key, token_specs, label=""):
|
||||||
"""
|
# TODO: replace with new Matcher.add()
|
||||||
Add a pattern to the matcher.
|
"""Add a pattern to the matcher.
|
||||||
|
|
||||||
Arguments:
|
entity_key (unicode or int): An ID for the entity.
|
||||||
entity_key (unicode or int):
|
token_specs (list): Description of the pattern to be matched.
|
||||||
An ID for the entity.
|
label (unicode): Label to assign to the matched pattern. Defaults to `""`.
|
||||||
token_specs:
|
|
||||||
Description of the pattern to be matched.
|
|
||||||
label:
|
|
||||||
Label to assign to the matched pattern. Defaults to "".
|
|
||||||
Returns:
|
|
||||||
None
|
|
||||||
"""
|
"""
|
||||||
token_specs = list(token_specs)
|
token_specs = list(token_specs)
|
||||||
if len(token_specs) == 0:
|
if len(token_specs) == 0:
|
||||||
|
@ -296,6 +259,7 @@ cdef class Matcher:
|
||||||
self._patterns[entity_key].append((label, token_specs))
|
self._patterns[entity_key].append((label, token_specs))
|
||||||
|
|
||||||
def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None):
|
def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None):
|
||||||
|
# TODO: replace with new Matcher.add()
|
||||||
self.add_entity(entity_key, attrs=attrs, if_exists='update',
|
self.add_entity(entity_key, attrs=attrs, if_exists='update',
|
||||||
acceptor=acceptor, on_match=on_match)
|
acceptor=acceptor, on_match=on_match)
|
||||||
for spec in specs:
|
for spec in specs:
|
||||||
|
@ -308,25 +272,21 @@ cdef class Matcher:
|
||||||
return entity_key
|
return entity_key
|
||||||
|
|
||||||
def has_entity(self, entity_key):
|
def has_entity(self, entity_key):
|
||||||
"""
|
# TODO: deprecate
|
||||||
Check whether the matcher has an entity.
|
"""Check whether the matcher has an entity.
|
||||||
|
|
||||||
Arguments:
|
entity_key (unicode or int): The entity key to check.
|
||||||
entity_key (string or int): The entity key to check.
|
RETURNS (bool): Whether the matcher has the entity.
|
||||||
Returns:
|
|
||||||
bool: Whether the matcher has the entity.
|
|
||||||
"""
|
"""
|
||||||
entity_key = self.normalize_entity_key(entity_key)
|
entity_key = self.normalize_entity_key(entity_key)
|
||||||
return entity_key in self._entities
|
return entity_key in self._entities
|
||||||
|
|
||||||
def get_entity(self, entity_key):
|
def get_entity(self, entity_key):
|
||||||
"""
|
# TODO: deprecate
|
||||||
Retrieve the attributes stored for an entity.
|
"""Retrieve the attributes stored for an entity.
|
||||||
|
|
||||||
Arguments:
|
entity_key (unicode or int): The entity to retrieve.
|
||||||
entity_key (unicode or int): The entity to retrieve.
|
RETURNS (dict): The entity attributes if present, otherwise None.
|
||||||
Returns:
|
|
||||||
The entity attributes if present, otherwise None.
|
|
||||||
"""
|
"""
|
||||||
entity_key = self.normalize_entity_key(entity_key)
|
entity_key = self.normalize_entity_key(entity_key)
|
||||||
if entity_key in self._entities:
|
if entity_key in self._entities:
|
||||||
|
@ -335,17 +295,12 @@ cdef class Matcher:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def __call__(self, Doc doc, acceptor=None):
|
def __call__(self, Doc doc, acceptor=None):
|
||||||
"""
|
"""Find all token sequences matching the supplied patterns on the `Doc`.
|
||||||
Find all token sequences matching the supplied patterns on the Doc.
|
|
||||||
|
|
||||||
Arguments:
|
doc (Doc): The document to match over.
|
||||||
doc (Doc):
|
RETURNS (list): A list of `(entity_key, label_id, start, end)` tuples,
|
||||||
The document to match over.
|
describing the matches. A match tuple describes a span
|
||||||
Returns:
|
`doc[start:end]`. The `label_id` and `entity_key` are both integers.
|
||||||
list
|
|
||||||
A list of (entity_key, label_id, start, end) tuples,
|
|
||||||
describing the matches. A match tuple describes a span doc[start:end].
|
|
||||||
The label_id and entity_key are both integers.
|
|
||||||
"""
|
"""
|
||||||
if acceptor is not None:
|
if acceptor is not None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -449,18 +404,13 @@ cdef class Matcher:
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
def pipe(self, docs, batch_size=1000, n_threads=2):
|
def pipe(self, docs, batch_size=1000, n_threads=2):
|
||||||
"""
|
"""Match a stream of documents, yielding them in turn.
|
||||||
Match a stream of documents, yielding them in turn.
|
|
||||||
|
|
||||||
Arguments:
|
docs (iterable): A stream of documents.
|
||||||
docs: A stream of documents.
|
batch_size (int): The number of documents to accumulate into a working set.
|
||||||
batch_size (int):
|
n_threads (int): The number of threads with which to work on the buffer
|
||||||
The number of documents to accumulate into a working set.
|
in parallel, if the `Matcher` implementation supports multi-threading.
|
||||||
n_threads (int):
|
YIELDS (Doc): Documents, in order.
|
||||||
The number of threads with which to work on the buffer in parallel,
|
|
||||||
if the Matcher implementation supports multi-threading.
|
|
||||||
Yields:
|
|
||||||
Doc Documents, in order.
|
|
||||||
"""
|
"""
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
self(doc)
|
self(doc)
|
||||||
|
|
|
@ -4,31 +4,26 @@ include ../../_includes/_mixins
|
||||||
|
|
||||||
p Match sequences of tokens, based on pattern rules.
|
p Match sequences of tokens, based on pattern rules.
|
||||||
|
|
||||||
+h(2, "load") Matcher.load
|
+infobox("⚠️ Deprecation note")
|
||||||
+tag classmethod
|
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
||||||
|
| are deprecated and have been replaced with a simpler
|
||||||
p Load the matcher and patterns from a file path.
|
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
||||||
|
| patterns and a callback for a given match ID.
|
||||||
+table(["Name", "Type", "Description"])
|
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
|
||||||
+row
|
| #[code Matcher.has_entity] and #[code Matcher.get_entity] (now redundant)
|
||||||
+cell #[code path]
|
| have been removed.
|
||||||
+cell #[code Path]
|
|
||||||
+cell Path to a JSON-formatted patterns file.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code vocab]
|
|
||||||
+cell #[code Vocab]
|
|
||||||
+cell The vocabulary that the documents to match over will refer to.
|
|
||||||
|
|
||||||
+footrow
|
|
||||||
+cell returns
|
|
||||||
+cell #[code Matcher]
|
|
||||||
+cell The newly constructed object.
|
|
||||||
|
|
||||||
+h(2, "init") Matcher.__init__
|
+h(2, "init") Matcher.__init__
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
p Create the Matcher.
|
p Create the rule-based #[code Matcher].
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.matcher import Matcher
|
||||||
|
from spacy.attrs import LOWER
|
||||||
|
|
||||||
|
patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]}
|
||||||
|
matcher = Matcher(nlp.vocab, patterns)
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -41,7 +36,7 @@ p Create the Matcher.
|
||||||
+row
|
+row
|
||||||
+cell #[code patterns]
|
+cell #[code patterns]
|
||||||
+cell dict
|
+cell dict
|
||||||
+cell Patterns to add to the matcher.
|
+cell Patterns to add to the matcher, keyed by ID.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
|
@ -51,7 +46,28 @@ p Create the Matcher.
|
||||||
+h(2, "call") Matcher.__call__
|
+h(2, "call") Matcher.__call__
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
p Find all token sequences matching the supplied patterns on the Doc.
|
p Find all token sequences matching the supplied patterns on the #[code Doc].
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.matcher import Matcher
|
||||||
|
from spacy.attrs import LOWER
|
||||||
|
|
||||||
|
matcher = Matcher(nlp.vocab)
|
||||||
|
pattern = [{LOWER: "hello"}, {LOWER: "world"}]
|
||||||
|
matcher.add("HelloWorld", pattern, on_match=None)
|
||||||
|
doc = nlp(u'hello world!')
|
||||||
|
matches = matcher(doc)
|
||||||
|
|
||||||
|
+infobox("Important note")
|
||||||
|
| By default, the matcher #[strong does not perform any action] on matches,
|
||||||
|
| like tagging matched phrases with entity types. Instead, actions need to
|
||||||
|
| be specified when #[strong adding patterns or entities], by
|
||||||
|
| passing in a callback function as the #[code on_match] argument on
|
||||||
|
| #[+api("matcher#add") #[code add]]. This allows you to define custom
|
||||||
|
| actions per pattern within the same matcher. For example, you might only
|
||||||
|
| want to merge some entity types, and set custom flags for other matched
|
||||||
|
| patterns. For more details and examples, see the usage workflow on
|
||||||
|
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -76,7 +92,7 @@ p Match a stream of documents, yielding them in turn.
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code docs]
|
+cell #[code docs]
|
||||||
+cell -
|
+cell iterable
|
||||||
+cell A stream of documents.
|
+cell A stream of documents.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
|
@ -97,83 +113,44 @@ p Match a stream of documents, yielding them in turn.
|
||||||
+cell #[code Doc]
|
+cell #[code Doc]
|
||||||
+cell Documents, in order.
|
+cell Documents, in order.
|
||||||
|
|
||||||
+h(2, "add_entity") Matcher.add_entity
|
+h(2, "add") Matcher.add
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
p Add an entity to the matcher.
|
p
|
||||||
|
| Add one or more patterns to the matcher, along with a callback function
|
||||||
|
| to handle the matches. The callback function will receive the arguments
|
||||||
|
| #[code matcher], #[code doc], #[code id] and #[code matches].
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.matcher import Matcher
|
||||||
|
from spacy.attrs import LOWER, ORTH
|
||||||
|
|
||||||
|
def on_match(matcher, doc, id, matches):
|
||||||
|
print('Matched!', matches)
|
||||||
|
|
||||||
|
matcher = Matcher(nlp.vocab)
|
||||||
|
matcher.add('HelloWorld', [{LOWER: "hello"}, {LOWER: "world"}], on_match=on_match)
|
||||||
|
matcher.add('GoogleMaps', [{ORTH: "Google"}, {ORTH: "Maps"}], on_match=on_match)
|
||||||
|
|
||||||
|
doc = nlp(u'HELLO WORLD on Google Maps.')
|
||||||
|
matches = matcher(doc)
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code entity_key]
|
+cell #[code match_id]
|
||||||
+cell unicode / int
|
|
||||||
+cell An ID for the entity.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code attrs]
|
|
||||||
+cell -
|
|
||||||
+cell Attributes to associate with the Matcher.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code if_exists]
|
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell
|
+cell An ID for the thing you're matching.
|
||||||
| #[code 'raise'], #[code 'ignore'] or #[code 'update']. Controls
|
|
||||||
| what happens if the entity ID already exists. Defaults to
|
|
||||||
| #[code 'raise'].
|
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code acceptor]
|
+cell #[code *patterns]
|
||||||
+cell -
|
+cell list
|
||||||
+cell Callback function to filter matches of the entity.
|
+cell
|
||||||
|
| Match pattern. A pattern consists of a list of dicts, where each
|
||||||
|
| dict describes a token.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code on_match]
|
+cell #[code on_match]
|
||||||
+cell -
|
+cell function
|
||||||
+cell Callback function to act on matches of the entity.
|
+cell
|
||||||
|
| Callback function to act on matches. Takes the arguments
|
||||||
+footrow
|
| #[code matcher], #[code doc], #[code id] and #[code matches].
|
||||||
+cell returns
|
|
||||||
+cell #[code None]
|
|
||||||
+cell -
|
|
||||||
|
|
||||||
+h(2, "add_pattern") Matcher.add_pattern
|
|
||||||
+tag method
|
|
||||||
|
|
||||||
p Add a pattern to the matcher.
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code entity_key]
|
|
||||||
+cell unicode / int
|
|
||||||
+cell An ID for the entity.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code token_specs]
|
|
||||||
+cell -
|
|
||||||
+cell Description of the pattern to be matched.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code label]
|
|
||||||
+cell unicode / int
|
|
||||||
+cell Label to assign to the matched pattern. Defaults to #[code ""].
|
|
||||||
|
|
||||||
+footrow
|
|
||||||
+cell returns
|
|
||||||
+cell #[code None]
|
|
||||||
+cell -
|
|
||||||
|
|
||||||
+h(2, "has_entity") Matcher.has_entity
|
|
||||||
+tag method
|
|
||||||
|
|
||||||
p Check whether the matcher has an entity.
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code entity_key]
|
|
||||||
+cell unicode / int
|
|
||||||
+cell The entity key to check.
|
|
||||||
|
|
||||||
+footrow
|
|
||||||
+cell returns
|
|
||||||
+cell bool
|
|
||||||
+cell Whether the matcher has the entity.
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user