mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	Update Matcher docstrings and API docs
This commit is contained in:
		
							parent
							
								
									c8580da686
								
							
						
					
					
						commit
						fe5d8819ea
					
				|  | @ -159,14 +159,14 @@ def _convert_strings(token_specs, string_store): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def merge_phrase(matcher, doc, i, matches): | def merge_phrase(matcher, doc, i, matches): | ||||||
|     '''Callback to merge a phrase on match''' |     """Callback to merge a phrase on match.""" | ||||||
|     ent_id, label, start, end = matches[i] |     ent_id, label, start, end = matches[i] | ||||||
|     span = doc[start : end] |     span = doc[start : end] | ||||||
|     span.merge(ent_type=label, ent_id=ent_id) |     span.merge(ent_type=label, ent_id=ent_id) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Matcher: | cdef class Matcher: | ||||||
|     '''Match sequences of tokens, based on pattern rules.''' |     """Match sequences of tokens, based on pattern rules.""" | ||||||
|     cdef Pool mem |     cdef Pool mem | ||||||
|     cdef vector[TokenPatternC*] patterns |     cdef vector[TokenPatternC*] patterns | ||||||
|     cdef readonly Vocab vocab |     cdef readonly Vocab vocab | ||||||
|  | @ -175,37 +175,13 @@ cdef class Matcher: | ||||||
|     cdef public object _callbacks |     cdef public object _callbacks | ||||||
|     cdef public object _acceptors |     cdef public object _acceptors | ||||||
| 
 | 
 | ||||||
|     @classmethod |  | ||||||
|     def load(cls, path, vocab): |  | ||||||
|         """ |  | ||||||
|         Load the matcher and patterns from a file path. |  | ||||||
| 
 |  | ||||||
|         Arguments: |  | ||||||
|             path (Path): |  | ||||||
|                 Path to a JSON-formatted patterns file. |  | ||||||
|             vocab (Vocab): |  | ||||||
|                 The vocabulary that the documents to match over will refer to. |  | ||||||
|         Returns: |  | ||||||
|             Matcher: The newly constructed object. |  | ||||||
|         """ |  | ||||||
|         if (path / 'gazetteer.json').exists(): |  | ||||||
|             with (path / 'gazetteer.json').open('r', encoding='utf8') as file_: |  | ||||||
|                 patterns = ujson.load(file_) |  | ||||||
|         else: |  | ||||||
|             patterns = {} |  | ||||||
|         return cls(vocab, patterns) |  | ||||||
| 
 |  | ||||||
|     def __init__(self, vocab, patterns={}): |     def __init__(self, vocab, patterns={}): | ||||||
|         """ |         """Create the Matcher. | ||||||
|         Create the Matcher. |  | ||||||
| 
 | 
 | ||||||
|         Arguments: |         vocab (Vocab): The vocabulary object, which must be shared with the | ||||||
|             vocab (Vocab): |             documents the matcher will operate on. | ||||||
|                 The vocabulary object, which must be shared with the documents |         patterns (dict): Patterns to add to the matcher. | ||||||
|                 the matcher will operate on. |         RETURNS (Matcher): The newly constructed object. | ||||||
|             patterns (dict): Patterns to add to the matcher. |  | ||||||
|         Returns: |  | ||||||
|             The newly constructed object. |  | ||||||
|         """ |         """ | ||||||
|         self._patterns = {} |         self._patterns = {} | ||||||
|         self._entities = {} |         self._entities = {} | ||||||
|  | @ -226,22 +202,15 @@ cdef class Matcher: | ||||||
| 
 | 
 | ||||||
|     def add_entity(self, entity_key, attrs=None, if_exists='raise', |     def add_entity(self, entity_key, attrs=None, if_exists='raise', | ||||||
|                    acceptor=None, on_match=None): |                    acceptor=None, on_match=None): | ||||||
|         """ |         # TODO: replace with new Matcher.add() | ||||||
|         Add an entity to the matcher. |         """Add an entity to the matcher. | ||||||
| 
 | 
 | ||||||
|         Arguments: |         entity_key (unicode or int): An ID for the entity. | ||||||
|             entity_key (unicode or int): |         attrs (dict): Attributes to associate with the `Matcher`. | ||||||
|                 An ID for the entity. |         if_exists (unicode): `'raise'`, `'ignore'` or `'update'`. Controls what | ||||||
|             attrs: |             happens if the entity ID already exists. Defaults to `'raise'`. | ||||||
|                 Attributes to associate with the Matcher. |         acceptor (function): Callback function to filter matches of the entity. | ||||||
|             if_exists ('raise', 'ignore' or 'update'): |         on_match (function): Callback function to act on matches of the entity. | ||||||
|                 Controls what happens if the entity ID already exists. Defaults to 'raise'. |  | ||||||
|             acceptor: |  | ||||||
|                 Callback function to filter matches of the entity. |  | ||||||
|             on_match: |  | ||||||
|                 Callback function to act on matches of the entity. |  | ||||||
|         Returns: |  | ||||||
|             None |  | ||||||
|         """ |         """ | ||||||
|         if if_exists not in ('raise', 'ignore', 'update'): |         if if_exists not in ('raise', 'ignore', 'update'): | ||||||
|             raise ValueError( |             raise ValueError( | ||||||
|  | @ -264,18 +233,12 @@ cdef class Matcher: | ||||||
|         self._callbacks[entity_key] = on_match |         self._callbacks[entity_key] = on_match | ||||||
| 
 | 
 | ||||||
|     def add_pattern(self, entity_key, token_specs, label=""): |     def add_pattern(self, entity_key, token_specs, label=""): | ||||||
|         """ |         # TODO: replace with new Matcher.add() | ||||||
|         Add a pattern to the matcher. |         """Add a pattern to the matcher. | ||||||
| 
 | 
 | ||||||
|         Arguments: |         entity_key (unicode): An ID for the entity. | ||||||
|             entity_key (unicode or int): |         token_specs (list): Description of the pattern to be matched. | ||||||
|                 An ID for the entity. |         label (unicode): Label to assign to the matched pattern. Defaults to `""`. | ||||||
|             token_specs: |  | ||||||
|                 Description of the pattern to be matched. |  | ||||||
|             label: |  | ||||||
|                 Label to assign to the matched pattern. Defaults to "". |  | ||||||
|         Returns: |  | ||||||
|             None |  | ||||||
|         """ |         """ | ||||||
|         token_specs = list(token_specs) |         token_specs = list(token_specs) | ||||||
|         if len(token_specs) == 0: |         if len(token_specs) == 0: | ||||||
|  | @ -296,6 +259,7 @@ cdef class Matcher: | ||||||
|         self._patterns[entity_key].append((label, token_specs)) |         self._patterns[entity_key].append((label, token_specs)) | ||||||
| 
 | 
 | ||||||
|     def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None): |     def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None): | ||||||
|  |         # TODO: replace with new Matcher.add() | ||||||
|         self.add_entity(entity_key, attrs=attrs, if_exists='update', |         self.add_entity(entity_key, attrs=attrs, if_exists='update', | ||||||
|                         acceptor=acceptor, on_match=on_match) |                         acceptor=acceptor, on_match=on_match) | ||||||
|         for spec in specs: |         for spec in specs: | ||||||
|  | @ -308,25 +272,21 @@ cdef class Matcher: | ||||||
|             return entity_key |             return entity_key | ||||||
| 
 | 
 | ||||||
|     def has_entity(self, entity_key): |     def has_entity(self, entity_key): | ||||||
|         """ |         # TODO: deprecate | ||||||
|         Check whether the matcher has an entity. |         """Check whether the matcher has an entity. | ||||||
| 
 | 
 | ||||||
|         Arguments: |         entity_key (unicode or int): The entity key to check. | ||||||
|             entity_key (string or int): The entity key to check. |         RETURNS (bool): Whether the matcher has the entity. | ||||||
|         Returns: |  | ||||||
|             bool: Whether the matcher has the entity. |  | ||||||
|         """ |         """ | ||||||
|         entity_key = self.normalize_entity_key(entity_key) |         entity_key = self.normalize_entity_key(entity_key) | ||||||
|         return entity_key in self._entities |         return entity_key in self._entities | ||||||
| 
 | 
 | ||||||
|     def get_entity(self, entity_key): |     def get_entity(self, entity_key): | ||||||
|         """ |         # TODO: deprecate | ||||||
|         Retrieve the attributes stored for an entity. |         """Retrieve the attributes stored for an entity. | ||||||
| 
 | 
 | ||||||
|         Arguments: |         entity_key (unicode or int): The entity to retrieve. | ||||||
|             entity_key (unicode or int): The entity to retrieve. |         RETURNS (dict): The entity attributes if present, otherwise None. | ||||||
|         Returns: |  | ||||||
|             The entity attributes if present, otherwise None. |  | ||||||
|         """ |         """ | ||||||
|         entity_key = self.normalize_entity_key(entity_key) |         entity_key = self.normalize_entity_key(entity_key) | ||||||
|         if entity_key in self._entities: |         if entity_key in self._entities: | ||||||
|  | @ -335,17 +295,12 @@ cdef class Matcher: | ||||||
|             return None |             return None | ||||||
| 
 | 
 | ||||||
|     def __call__(self, Doc doc, acceptor=None): |     def __call__(self, Doc doc, acceptor=None): | ||||||
|         """ |         """Find all token sequences matching the supplied patterns on the `Doc`. | ||||||
|         Find all token sequences matching the supplied patterns on the Doc. |  | ||||||
| 
 | 
 | ||||||
|         Arguments: |         doc (Doc): The document to match over. | ||||||
|             doc (Doc): |         RETURNS (list): A list of `(entity_key, label_id, start, end)` tuples, | ||||||
|                 The document to match over. |             describing the matches. A match tuple describes a span | ||||||
|         Returns: |             `doc[start:end]`. The `label_id` and `entity_key` are both integers. | ||||||
|             list |  | ||||||
|             A list of (entity_key, label_id, start, end) tuples, |  | ||||||
|             describing the matches. A match tuple describes a span doc[start:end]. |  | ||||||
|             The label_id and entity_key are both integers. |  | ||||||
|         """ |         """ | ||||||
|         if acceptor is not None: |         if acceptor is not None: | ||||||
|             raise ValueError( |             raise ValueError( | ||||||
|  | @ -449,18 +404,13 @@ cdef class Matcher: | ||||||
|         return matches |         return matches | ||||||
| 
 | 
 | ||||||
|     def pipe(self, docs, batch_size=1000, n_threads=2): |     def pipe(self, docs, batch_size=1000, n_threads=2): | ||||||
|         """ |         """Match a stream of documents, yielding them in turn. | ||||||
|         Match a stream of documents, yielding them in turn. |  | ||||||
| 
 | 
 | ||||||
|         Arguments: |         docs (iterable): A stream of documents. | ||||||
|             docs: A stream of documents. |         batch_size (int): The number of documents to accumulate into a working set. | ||||||
|             batch_size (int): |         n_threads (int): The number of threads with which to work on the buffer | ||||||
|                 The number of documents to accumulate into a working set. |             in parallel, if the `Matcher` implementation supports multi-threading. | ||||||
|             n_threads (int): |         YIELDS (Doc): Documents, in order. | ||||||
|                 The number of threads with which to work on the buffer in parallel, |  | ||||||
|                 if the Matcher implementation supports multi-threading. |  | ||||||
|         Yields: |  | ||||||
|             Doc Documents, in order. |  | ||||||
|         """ |         """ | ||||||
|         for doc in docs: |         for doc in docs: | ||||||
|             self(doc) |             self(doc) | ||||||
|  |  | ||||||
|  | @ -4,31 +4,26 @@ include ../../_includes/_mixins | ||||||
| 
 | 
 | ||||||
| p Match sequences of tokens, based on pattern rules. | p Match sequences of tokens, based on pattern rules. | ||||||
| 
 | 
 | ||||||
| +h(2, "load") Matcher.load | +infobox("⚠️ Deprecation note") | ||||||
|     +tag classmethod |     |  As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity] | ||||||
| 
 |     |  are deprecated and have been replaced with a simpler | ||||||
| p Load the matcher and patterns from a file path. |     |  #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of | ||||||
| 
 |     |  patterns and a callback for a given match ID. | ||||||
| +table(["Name", "Type", "Description"]) |     |  #[code Matcher.load] (not useful, as it didn't allow specifying callbacks), | ||||||
|     +row |     |  #[code Matcher.has_entity] and #[code Matcher.get_entity] (now redundant) | ||||||
|         +cell #[code path] |     |  have been removed. | ||||||
|         +cell #[code Path] |  | ||||||
|         +cell Path to a JSON-formatted patterns file. |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code vocab] |  | ||||||
|         +cell #[code Vocab] |  | ||||||
|         +cell The vocabulary that the documents to match over will refer to. |  | ||||||
| 
 |  | ||||||
|     +footrow |  | ||||||
|         +cell returns |  | ||||||
|         +cell #[code Matcher] |  | ||||||
|         +cell The newly constructed object. |  | ||||||
| 
 | 
 | ||||||
| +h(2, "init") Matcher.__init__ | +h(2, "init") Matcher.__init__ | ||||||
|     +tag method |     +tag method | ||||||
| 
 | 
 | ||||||
| p Create the Matcher. | p Create the rule-based #[code Matcher]. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.matcher import Matcher | ||||||
|  |     from spacy.attrs import LOWER | ||||||
|  | 
 | ||||||
|  |     patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]} | ||||||
|  |     matcher = Matcher(nlp.vocab) | ||||||
| 
 | 
 | ||||||
| +table(["Name", "Type", "Description"]) | +table(["Name", "Type", "Description"]) | ||||||
|     +row |     +row | ||||||
|  | @ -41,7 +36,7 @@ p Create the Matcher. | ||||||
|     +row |     +row | ||||||
|         +cell #[code patterns] |         +cell #[code patterns] | ||||||
|         +cell dict |         +cell dict | ||||||
|         +cell Patterns to add to the matcher. |         +cell Patterns to add to the matcher, keyed by ID. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +footrow | ||||||
|         +cell returns |         +cell returns | ||||||
|  | @ -51,7 +46,28 @@ p Create the Matcher. | ||||||
| +h(2, "call") Matcher.__call__ | +h(2, "call") Matcher.__call__ | ||||||
|     +tag method |     +tag method | ||||||
| 
 | 
 | ||||||
| p Find all token sequences matching the supplied patterns on the Doc. | p Find all token sequences matching the supplied patterns on the #[code Doc]. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.matcher import Matcher | ||||||
|  |     from spacy.attrs import LOWER | ||||||
|  | 
 | ||||||
|  |     matcher = Matcher(nlp.vocab) | ||||||
|  |     pattern = [{LOWER: "hello"}, {LOWER: "world"}] | ||||||
|  |     matcher.add("HelloWorld", pattern, on_match=None) | ||||||
|  |     doc = nlp(u'hello world!') | ||||||
|  |     matches = matcher(doc) | ||||||
|  | 
 | ||||||
|  | +infobox("Important note") | ||||||
|  |     |  By default, the matcher #[strong does not perform any action] on matches, | ||||||
|  |     |  like tagging matched phrases with entity types. Instead, actions need to | ||||||
|  |     |  be specified when #[strong adding patterns or entities], by | ||||||
|  |     |  passing in a callback function as the #[code on_match] argument on | ||||||
|  |     |  #[+api("matcher#add") #[code add]]. This allows you to define custom | ||||||
|  |     |  actions per pattern within the same matcher. For example, you might only | ||||||
|  |     |  want to merge some entity types, and set custom flags for other matched | ||||||
|  |     |  patterns. For more details and examples, see the usage workflow on | ||||||
|  |     |  #[+a("/docs/usage/rule-based-matching") rule-based matching]. | ||||||
| 
 | 
 | ||||||
| +table(["Name", "Type", "Description"]) | +table(["Name", "Type", "Description"]) | ||||||
|     +row |     +row | ||||||
|  | @ -76,7 +92,7 @@ p Match a stream of documents, yielding them in turn. | ||||||
| +table(["Name", "Type", "Description"]) | +table(["Name", "Type", "Description"]) | ||||||
|     +row |     +row | ||||||
|         +cell #[code docs] |         +cell #[code docs] | ||||||
|         +cell - |         +cell iterable | ||||||
|         +cell A stream of documents. |         +cell A stream of documents. | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|  | @ -97,83 +113,44 @@ p Match a stream of documents, yielding them in turn. | ||||||
|         +cell #[code Doc] |         +cell #[code Doc] | ||||||
|         +cell Documents, in order. |         +cell Documents, in order. | ||||||
| 
 | 
 | ||||||
| +h(2, "add_entity") Matcher.add_entity | +h(2, "add") Matcher.add | ||||||
|     +tag method |     +tag method | ||||||
| 
 | 
 | ||||||
| p Add an entity to the matcher. | p | ||||||
|  |     |  Add one or more patterns to the matcher, along with a callback function | ||||||
|  |     |  to handle the matches. The callback function will receive the arguments | ||||||
|  |     |  #[code matcher], #[code doc], #[code id] and #[code matches]. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.matcher import Matcher | ||||||
|  |     from spacy.attrs import LOWER, ORTH | ||||||
|  | 
 | ||||||
|  |     def on_match(matcher, doc, id, matches): | ||||||
|  |         print('Matched!', matches) | ||||||
|  | 
 | ||||||
|  |     matcher = Matcher(nlp.vocab) | ||||||
|  |     matcher.add('HelloWorld', [{LOWER: "hello"}, {LOWER: "world"}], on_match=on_match) | ||||||
|  |     matcher.add('GoogleMaps', [{ORTH: "Google"}, {ORTH: "Maps"}], on_match=on_match) | ||||||
|  | 
 | ||||||
|  |     doc = nlp(u'HELLO WORLD on Google Maps.') | ||||||
|  |     matches = matcher(doc) | ||||||
| 
 | 
 | ||||||
| +table(["Name", "Type", "Description"]) | +table(["Name", "Type", "Description"]) | ||||||
|     +row |     +row | ||||||
|         +cell #[code entity_key] |         +cell #[code match_id] | ||||||
|         +cell unicode / int |  | ||||||
|         +cell An ID for the entity. |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code attrs] |  | ||||||
|         +cell - |  | ||||||
|         +cell Attributes to associate with the Matcher. |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code if_exists] |  | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell |         +cell An ID for the thing you're matching. | ||||||
|             |  #[code 'raise'], #[code 'ignore'] or #[code 'update']. Controls |  | ||||||
|             |  what happens if the entity ID already exists. Defaults to |  | ||||||
|             |  #[code 'raise']. |  | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code acceptor] |         +cell #[code *patterns] | ||||||
|         +cell - |         +cell list | ||||||
|         +cell Callback function to filter matches of the entity. |         +cell | ||||||
|  |             |  Match pattern. A pattern consists of a list of dicts, where each | ||||||
|  |             |  dict describes a token. | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code on_match] |         +cell #[code on_match] | ||||||
|         +cell - |         +cell function | ||||||
|         +cell Callback function to act on matches of the entity. |         +cell | ||||||
| 
 |             |  Callback function to act on matches. Takes the arguments | ||||||
|     +footrow |             |  #[code matcher], #[code doc], #[code id] and #[code matches]. | ||||||
|         +cell returns |  | ||||||
|         +cell #[code None] |  | ||||||
|         +cell - |  | ||||||
| 
 |  | ||||||
| +h(2, "add_pattern") Matcher.add_pattern |  | ||||||
|     +tag method |  | ||||||
| 
 |  | ||||||
| p Add a pattern to the matcher. |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code entity_key] |  | ||||||
|         +cell unicode / int |  | ||||||
|         +cell An ID for the entity. |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code token_specs] |  | ||||||
|         +cell - |  | ||||||
|         +cell Description of the pattern to be matched. |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code label] |  | ||||||
|         +cell unicode / int |  | ||||||
|         +cell Label to assign to the matched pattern. Defaults to #[code ""]. |  | ||||||
| 
 |  | ||||||
|     +footrow |  | ||||||
|         +cell returns |  | ||||||
|         +cell #[code None] |  | ||||||
|         +cell - |  | ||||||
| 
 |  | ||||||
| +h(2, "has_entity") Matcher.has_entity |  | ||||||
|     +tag method |  | ||||||
| 
 |  | ||||||
| p Check whether the matcher has an entity. |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code entity_key] |  | ||||||
|         +cell unicode / int |  | ||||||
|         +cell The entity key to check. |  | ||||||
| 
 |  | ||||||
|     +footrow |  | ||||||
|         +cell returns |  | ||||||
|         +cell bool |  | ||||||
|         +cell Whether the matcher has the entity. |  | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user