mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-22 19:54:18 +03:00 
			
		
		
		
	Update docs and fix consistency
This commit is contained in:
		
							parent
							
								
									7c6854d8d4
								
							
						
					
					
						commit
						d5c78c7a34
					
				|  | @ -35,7 +35,7 @@ def pretrain_cli( | |||
|     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False), | ||||
|     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), | ||||
|     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), | ||||
|     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."), | ||||
|     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), | ||||
|     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), | ||||
|     # fmt: on | ||||
| ): | ||||
|  |  | |||
|  | @ -68,11 +68,11 @@ cdef class DependencyMatcher: | |||
|         key (str): The match ID. | ||||
|         RETURNS (bool): Whether the matcher contains rules for this match ID. | ||||
|         """ | ||||
|         return self._normalize_key(key) in self._patterns | ||||
|         return self.has_key(key) | ||||
| 
 | ||||
|     def validateInput(self, pattern, key): | ||||
|     def validate_input(self, pattern, key): | ||||
|         idx = 0 | ||||
|         visitedNodes = {} | ||||
|         visited_nodes = {} | ||||
|         for relation in pattern: | ||||
|             if "PATTERN" not in relation or "SPEC" not in relation: | ||||
|                 raise ValueError(Errors.E098.format(key=key)) | ||||
|  | @ -83,7 +83,7 @@ cdef class DependencyMatcher: | |||
|                     and "NBOR_NAME" not in relation["SPEC"] | ||||
|                 ): | ||||
|                     raise ValueError(Errors.E099.format(key=key)) | ||||
|                 visitedNodes[relation["SPEC"]["NODE_NAME"]] = True | ||||
|                 visited_nodes[relation["SPEC"]["NODE_NAME"]] = True | ||||
|             else: | ||||
|                 if not( | ||||
|                     "NODE_NAME" in relation["SPEC"] | ||||
|  | @ -92,22 +92,28 @@ cdef class DependencyMatcher: | |||
|                 ): | ||||
|                     raise ValueError(Errors.E100.format(key=key)) | ||||
|                 if ( | ||||
|                     relation["SPEC"]["NODE_NAME"] in visitedNodes | ||||
|                     or relation["SPEC"]["NBOR_NAME"] not in visitedNodes | ||||
|                     relation["SPEC"]["NODE_NAME"] in visited_nodes | ||||
|                     or relation["SPEC"]["NBOR_NAME"] not in visited_nodes | ||||
|                 ): | ||||
|                     raise ValueError(Errors.E101.format(key=key)) | ||||
|                 visitedNodes[relation["SPEC"]["NODE_NAME"]] = True | ||||
|                 visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True | ||||
|                 visited_nodes[relation["SPEC"]["NODE_NAME"]] = True | ||||
|                 visited_nodes[relation["SPEC"]["NBOR_NAME"]] = True | ||||
|             idx = idx + 1 | ||||
| 
 | ||||
|     def add(self, key, patterns, *_patterns, on_match=None): | ||||
|         """Add a new matcher rule to the matcher. | ||||
| 
 | ||||
|         key (str): The match ID. | ||||
|         patterns (list): The patterns to add for the given key. | ||||
|         on_match (callable): Optional callback executed on match. | ||||
|         """ | ||||
|         if patterns is None or hasattr(patterns, "__call__"):  # old API | ||||
|             on_match = patterns | ||||
|             patterns = _patterns | ||||
|         for pattern in patterns: | ||||
|             if len(pattern) == 0: | ||||
|                 raise ValueError(Errors.E012.format(key=key)) | ||||
|             self.validateInput(pattern,key) | ||||
|             self.validate_input(pattern,key) | ||||
|         key = self._normalize_key(key) | ||||
|         _patterns = [] | ||||
|         for pattern in patterns: | ||||
|  | @ -187,8 +193,7 @@ cdef class DependencyMatcher: | |||
|         key (string or int): The key to check. | ||||
|         RETURNS (bool): Whether the matcher has the rule. | ||||
|         """ | ||||
|         key = self._normalize_key(key) | ||||
|         return key in self._patterns | ||||
|         return self._normalize_key(key) in self._patterns | ||||
| 
 | ||||
|     def get(self, key, default=None): | ||||
|         """Retrieve the pattern stored for a key. | ||||
|  | @ -202,6 +207,13 @@ cdef class DependencyMatcher: | |||
|         return (self._callbacks[key], self._patterns[key]) | ||||
| 
 | ||||
|     def __call__(self, Doc doc): | ||||
|         """Find all token sequences matching the supplied pattern. | ||||
| 
 | ||||
|         doc (Doc): The document to match over. | ||||
|         RETURNS (list): A list of `(key, start, end)` tuples, | ||||
|             describing the matches. A match tuple describes a span | ||||
|             `doc[start:end]`. The `label_id` and `key` are both integers. | ||||
|         """ | ||||
|         matched_key_trees = [] | ||||
|         matches = self.token_matcher(doc) | ||||
|         for key in list(self._patterns.keys()): | ||||
|  | @ -241,25 +253,25 @@ cdef class DependencyMatcher: | |||
|                     on_match(self, doc, i, matched_key_trees) | ||||
|         return matched_key_trees | ||||
| 
 | ||||
|     def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees): | ||||
|     def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visited_nodes,matched_trees): | ||||
|         cdef bool isValid; | ||||
|         if(patternLength == len(id_to_position.keys())): | ||||
|             isValid = True | ||||
|             for node in range(patternLength): | ||||
|                 if(node in tree): | ||||
|                     for idx, (relop,nbor) in enumerate(tree[node]): | ||||
|                         computed_nbors = numpy.asarray(_node_operator_map[visitedNodes[node]][relop]) | ||||
|                         computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop]) | ||||
|                         isNbor = False | ||||
|                         for computed_nbor in computed_nbors: | ||||
|                             if(computed_nbor.i == visitedNodes[nbor]): | ||||
|                             if(computed_nbor.i == visited_nodes[nbor]): | ||||
|                                 isNbor = True | ||||
|                         isValid = isValid & isNbor | ||||
|             if(isValid): | ||||
|                 matched_trees.append(visitedNodes) | ||||
|                 matched_trees.append(visited_nodes) | ||||
|             return | ||||
|         allPatternNodes = numpy.asarray(id_to_position[patternLength]) | ||||
|         for patternNode in allPatternNodes: | ||||
|             self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visitedNodes+[patternNode],matched_trees) | ||||
|             self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visited_nodes+[patternNode],matched_trees) | ||||
| 
 | ||||
|     # Given a node and an edge operator, to return the list of nodes | ||||
|     # from the doc that belong to node+operator. This is used to store | ||||
|  |  | |||
|  | @ -70,7 +70,7 @@ cdef class Matcher: | |||
|         key (str): The match ID. | ||||
|         RETURNS (bool): Whether the matcher contains rules for this match ID. | ||||
|         """ | ||||
|         return self._normalize_key(key) in self._patterns | ||||
|         return self.has_key(key) | ||||
| 
 | ||||
|     def add(self, key, patterns, *, on_match=None, greedy: str=None): | ||||
|         """Add a match-rule to the matcher. A match-rule consists of: an ID | ||||
|  | @ -162,8 +162,7 @@ cdef class Matcher: | |||
|         key (string or int): The key to check. | ||||
|         RETURNS (bool): Whether the matcher has the rule. | ||||
|         """ | ||||
|         key = self._normalize_key(key) | ||||
|         return key in self._patterns | ||||
|         return self._normalize_key(key) in self._patterns | ||||
| 
 | ||||
|     def get(self, key, default=None): | ||||
|         """Retrieve the pattern stored for a key. | ||||
|  | @ -179,7 +178,7 @@ cdef class Matcher: | |||
|     def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False): | ||||
|         """Match a stream of documents, yielding them in turn. | ||||
| 
 | ||||
|         docs (iterable): A stream of documents. | ||||
|         docs (Iterable[Union[Doc, Span]]): A stream of documents or spans. | ||||
|         batch_size (int): Number of documents to accumulate into a working set. | ||||
|         return_matches (bool): Yield the match lists along with the docs, making | ||||
|             results (doc, matches) tuples. | ||||
|  |  | |||
|  | @ -75,8 +75,8 @@ class Morphologizer(Tagger): | |||
|         model (thinc.api.Model): The Thinc Model powering the pipeline component. | ||||
|         name (str): The component instance name, used to add entries to the | ||||
|             losses during training. | ||||
|         labels_morph (dict): TODO: | ||||
|         labels_pos (dict): TODO: | ||||
|         labels_morph (dict): Mapping of morph + POS tags to morph labels. | ||||
|         labels_pos (dict): Mapping of morph + POS tags to POS tags. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/morphologizer#init | ||||
|         """ | ||||
|  |  | |||
|  | @ -601,9 +601,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides | |||
| 
 | ||||
| ## Pretrain {#pretrain new="2.1" tag="experimental"} | ||||
| 
 | ||||
| <!-- TODO: document new pretrain command and link to new pretraining docs --> | ||||
| 
 | ||||
| Pre-train the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline | ||||
| Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline | ||||
| components on [raw text](/api/data-formats#pretrain), using an approximate | ||||
| language-modeling objective. Specifically, we load pretrained vectors, and train | ||||
| a component like a CNN, BiLSTM, etc. to predict vectors which match the | ||||
|  | @ -611,7 +609,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can | |||
| then include a **path to one of these pretrained weights files** in your | ||||
| [training config](/usage/training#config) as the `init_tok2vec` setting when you | ||||
| train your model. This technique may be especially helpful if you have little | ||||
| labelled data. | ||||
| labelled data. See the usage docs on [pretraining](/usage/training#pretraining) | ||||
| for more info. | ||||
| 
 | ||||
| <Infobox title="Changed in v3.0" variant="warning"> | ||||
| 
 | ||||
|  | @ -634,8 +633,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path] | |||
| | `output_dir`            | positional | Directory to write models to on each epoch.                                                                                                                                  | | ||||
| | `config_path`           | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters.                                                                        | | ||||
| | `--code`, `-c`          | option     | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures.                         | | ||||
| | `--resume-path`, `-r`   | option     | TODO:                                                                                                                                                                        | | ||||
| | `--epoch-resume`, `-er` | option     | TODO:                                                                                                                                                                        | | ||||
| | `--resume-path`, `-r`   | option     | Path to pretrained weights from which to resume pretraining.                                                                                                                 | | ||||
| | `--epoch-resume`, `-er` | option     | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files.                                                      | | ||||
| | `--help`, `-h`          | flag       | Show help message and available arguments.                                                                                                                                   | | ||||
| | overrides               |            | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`.                | | ||||
| | **CREATES**             | weights    | The pretrained weights that can be used to initialize `spacy train`.                                                                                                         | | ||||
|  |  | |||
|  | @ -20,9 +20,9 @@ Config files define the training process and model pipeline and can be passed to | |||
| [`spacy train`](/api/cli#train). They use | ||||
| [Thinc's configuration system](https://thinc.ai/docs/usage-config) under the | ||||
| hood. For details on how to use training configs, see the | ||||
| [usage documentation](/usage/training#config). | ||||
| 
 | ||||
| <!-- TODO: add details on getting started and init config --> | ||||
| [usage documentation](/usage/training#config). To get started with a blank | ||||
| config or fill a partial config with all defaults, you can use the | ||||
| [`init config`](/api/cli#init-config) command. | ||||
| 
 | ||||
| > #### What does the @ mean? | ||||
| > | ||||
|  | @ -52,8 +52,6 @@ your config and check that it's valid, you can run the | |||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| <!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command --> | ||||
| 
 | ||||
| ### nlp {#config-nlp tag="section"} | ||||
| 
 | ||||
| > #### Example | ||||
|  | @ -154,8 +152,6 @@ This section is optional and defines settings and controls for | |||
| [language model pretraining](/usage/training#pretraining). It's used when you | ||||
| run [`spacy pretrain`](/api/cli#pretrain). | ||||
| 
 | ||||
| <!-- TODO: complete --> | ||||
| 
 | ||||
| | Name                         | Type                                                | Description                                                                   | Default                                             | | ||||
| | ---------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------- | --------------------------------------------------- | | ||||
| | `max_epochs`                 | int                                                 | Maximum number of epochs.                                                     | `1000`                                              | | ||||
|  |  | |||
|  | @ -5,4 +5,194 @@ tag: class | |||
| source: spacy/matcher/dependencymatcher.pyx | ||||
| --- | ||||
| 
 | ||||
| TODO: write | ||||
| The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher) | ||||
| and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees | ||||
| using the | ||||
| [Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). | ||||
| It requires a pretrained [`DependencyParser`](/api/parser) or other component | ||||
| that sets the `Token.dep` attribute. | ||||
| 
 | ||||
| ## Pattern format {#patterns} | ||||
| 
 | ||||
| > ```json | ||||
| > ### Example | ||||
| > [ | ||||
| >   { | ||||
| >     "SPEC": {"NODE_NAME": "founded"}, | ||||
| >     "PATTERN": {"ORTH": "founded"} | ||||
| >   }, | ||||
| >   { | ||||
| >     "SPEC": { | ||||
| >       "NODE_NAME": "founder", | ||||
| >       "NBOR_RELOP": ">", | ||||
| >       "NBOR_NAME": "founded" | ||||
| >     }, | ||||
| >     "PATTERN": {"DEP": "nsubj"} | ||||
| >   }, | ||||
| >   { | ||||
| >     "SPEC": { | ||||
| >       "NODE_NAME": "object", | ||||
| >       "NBOR_RELOP": ">", | ||||
| >       "NBOR_NAME": "founded" | ||||
| >     }, | ||||
| >     "PATTERN": {"DEP": "dobj"} | ||||
| >   } | ||||
| > ] | ||||
| > ``` | ||||
| 
 | ||||
| A pattern added to the `DependencyMatcher` consists of a list of dictionaries, | ||||
| with each dictionary describing a node to match. Each dictionary should have the | ||||
| following top-level keys: | ||||
| 
 | ||||
| | Name      | Type | Description                                                                                                                 | | ||||
| | --------- | ---- | --------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). | | ||||
| | `SPEC`    | dict | The relationships of the nodes in the subtree that should be matched.                                                       | | ||||
| 
 | ||||
| The `SPEC` includes the following fields: | ||||
| 
 | ||||
| | Name         | Type | Description                                                                                                                                                            | | ||||
| | ------------ | ---- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `NODE_NAME`  | str  | A unique name for this node to refer to it in other specs.                                                                                                             | | ||||
| | `NBOR_RELOP` | str  | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. | | ||||
| | `NBOR_NAME`  | str  | The unique name of the node that this node is connected to.                                                                                                            | | ||||
| 
 | ||||
| ## DependencyMatcher.\_\_init\_\_ {#init tag="method"} | ||||
| 
 | ||||
| Create a rule-based `DependencyMatcher`. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.matcher import DependencyMatcher | ||||
| > matcher = DependencyMatcher(nlp.vocab) | ||||
| > ``` | ||||
| 
 | ||||
| | Name    | Type    | Description                                                                                 | | ||||
| | ------- | ------- | ------------------------------------------------------------------------------------------- | | ||||
| | `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | | ||||
| 
 | ||||
| ## DependencyMatcher.\_\_call\_\_ {#call tag="method"} | ||||
| 
 | ||||
| Find all token sequences matching the supplied patterns on the `Doc` or `Span`. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.matcher import DependencyMatcher | ||||
| > | ||||
| > matcher = DependencyMatcher(nlp.vocab) | ||||
| > pattern = [ | ||||
| >     {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}}, | ||||
| >     {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}}, | ||||
| > ] | ||||
| > matcher.add("Founder", [pattern]) | ||||
| > doc = nlp("Bill Gates founded Microsoft.") | ||||
| > matches = matcher(doc) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type         | Description                                                                                                                                                              | | ||||
| | ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `doclike`   | `Doc`/`Span` | The `Doc` or `Span` to match over.                                                                                                                                       | | ||||
| | **RETURNS** | list         | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. | | ||||
| 
 | ||||
| ## DependencyMatcher.\_\_len\_\_ {#len tag="method"} | ||||
| 
 | ||||
| Get the number of rules (edges) added to the dependency matcher. Note that this | ||||
| only returns the number of rules (identical with the number of IDs), not the | ||||
| number of individual patterns. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > matcher = DependencyMatcher(nlp.vocab) | ||||
| > assert len(matcher) == 0 | ||||
| > pattern = [ | ||||
| >     {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}}, | ||||
| >     {"SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}}, | ||||
| > ] | ||||
| > matcher.add("Rule", [pattern]) | ||||
| > assert len(matcher) == 1 | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type | Description          | | ||||
| | ----------- | ---- | -------------------- | | ||||
| | **RETURNS** | int  | The number of rules. | | ||||
| 
 | ||||
| ## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"} | ||||
| 
 | ||||
| Check whether the matcher contains rules for a match ID. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > matcher = DependencyMatcher(nlp.vocab) | ||||
| > assert "Rule" not in matcher | ||||
| > matcher.add("Rule", [pattern]) | ||||
| > assert "Rule" in matcher | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type | Description                                           | | ||||
| | ----------- | ---- | ----------------------------------------------------- | | ||||
| | `key`       | str  | The match ID.                                         | | ||||
| | **RETURNS** | bool | Whether the matcher contains rules for this match ID. | | ||||
| 
 | ||||
| ## DependencyMatcher.add {#add tag="method"} | ||||
| 
 | ||||
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and an | ||||
| optional callback function to act on the matches. The callback function will | ||||
| receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already | ||||
| exists for the given ID, the patterns will be extended. An `on_match` callback | ||||
| will be overwritten. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > def on_match(matcher, doc, i, matches): | ||||
| >     print('Matched!', matches) | ||||
| > | ||||
| > matcher = DependencyMatcher(nlp.vocab) | ||||
| > matcher.add("TEST_PATTERNS", patterns, on_match=on_match) | ||||
| > ``` | ||||
| 
 | ||||
| | Name           | Type               | Description                                                                                   | | ||||
| | -------------- | ------------------ | --------------------------------------------------------------------------------------------- | | ||||
| | `match_id`     | str                | An ID for the thing you're matching.                                                          | | ||||
| | `patterns`     | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a token.      | | ||||
| | _keyword-only_ |                    |                                                                                               | | ||||
| | `on_match`     | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | ||||
| 
 | ||||
| ## DependencyMatcher.remove {#remove tag="method"} | ||||
| 
 | ||||
| Remove a rule from the matcher. A `KeyError` is raised if the match ID does not | ||||
| exist. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > matcher.add("Rule", [pattern]) | ||||
| > assert "Rule" in matcher | ||||
| > matcher.remove("Rule") | ||||
| > assert "Rule" not in matcher | ||||
| > ``` | ||||
| 
 | ||||
| | Name  | Type | Description               | | ||||
| | ----- | ---- | ------------------------- | | ||||
| | `key` | str  | The ID of the match rule. | | ||||
| 
 | ||||
| ## DependencyMatcher.get {#get tag="method"} | ||||
| 
 | ||||
| Retrieve the pattern stored for a key. Returns the rule as an | ||||
| `(on_match, patterns)` tuple containing the callback and available patterns. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > matcher.add("Rule", [pattern], on_match=on_match) | ||||
| > on_match, patterns = matcher.get("Rule") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type  | Description                                   | | ||||
| | ----------- | ----- | --------------------------------------------- | | ||||
| | `key`       | str   | The ID of the match rule.                     | | ||||
| | **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. | | ||||
|  |  | |||
|  | @ -5,6 +5,82 @@ tag: class | |||
| source: spacy/matcher/matcher.pyx | ||||
| --- | ||||
| 
 | ||||
| The `Matcher` lets you find words and phrases using rules describing their token | ||||
| attributes. Rules can refer to token annotations (like the text or | ||||
| part-of-speech tags), as well as lexical attributes like `Token.is_punct`. | ||||
| Applying the matcher to a [`Doc`](/api/doc) gives you access to the matched | ||||
| tokens in context. For in-depth examples and workflows for combining rules and | ||||
| statistical models, see the [usage guide](/usage/rule-based-matching) on | ||||
| rule-based matching. | ||||
| 
 | ||||
| ## Pattern format {#patterns} | ||||
| 
 | ||||
| > ```json | ||||
| > ### Example | ||||
| > [ | ||||
| >   {"LOWER": "i"}, | ||||
| >   {"LEMMA": {"IN": ["like", "love"]}}, | ||||
| >   {"POS": "NOUN", "OP": "+"} | ||||
| > ] | ||||
| > ``` | ||||
| 
 | ||||
| A pattern added to the `Matcher` consists of a list of dictionaries. Each | ||||
| dictionary describes **one token** and its attributes. The available token | ||||
| pattern keys correspond to a number of | ||||
| [`Token` attributes](/api/token#attributes). The supported attributes for | ||||
| rule-based matching are: | ||||
| 
 | ||||
| | Attribute                              | Type |  Description                                                                                           | | ||||
| | -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ | | ||||
| | `ORTH`                                 | str  | The exact verbatim text of a token.                                                                    | | ||||
| | `TEXT` <Tag variant="new">2.1</Tag>    | str  | The exact verbatim text of a token.                                                                    | | ||||
| | `LOWER`                                | str  | The lowercase form of the token text.                                                                  | | ||||
| |  `LENGTH`                              | int  | The length of the token text.                                                                          | | ||||
| |  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | bool | Token text consists of alphabetic characters, ASCII characters, digits.                                | | ||||
| |  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | bool | Token text is in lowercase, uppercase, titlecase.                                                      | | ||||
| |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | bool | Token is punctuation, whitespace, stop word.                                                           | | ||||
| |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | bool | Token text resembles a number, URL, email.                                                             | | ||||
| |  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str  | The token's simple and extended part-of-speech tag, dependency label, lemma, shape.                    | | ||||
| | `ENT_TYPE`                             | str  | The token's entity label.                                                                              | | ||||
| | `_` <Tag variant="new">2.1</Tag>       | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | | ||||
| | `OP`                                   | str  | Operator or quantifier to determine how often to match a token pattern.                                | | ||||
| 
 | ||||
| Operators and quantifiers define **how often** a token pattern should be | ||||
| matched: | ||||
| 
 | ||||
| > ```json | ||||
| > ### Example | ||||
| > [ | ||||
| >   {"POS": "ADJ", "OP": "*"}, | ||||
| >   {"POS": "NOUN", "OP": "+"} | ||||
| > ] | ||||
| > ``` | ||||
| 
 | ||||
| | OP  | Description                                                      | | ||||
| | --- | ---------------------------------------------------------------- | | ||||
| | `!` | Negate the pattern, by requiring it to match exactly 0 times.    | | ||||
| | `?` | Make the pattern optional, by allowing it to match 0 or 1 times. | | ||||
| | `+` | Require the pattern to match 1 or more times.                    | | ||||
| | `*` | Allow the pattern to match zero or more times.                   | | ||||
| 
 | ||||
| Token patterns can also map to a **dictionary of properties** instead of a | ||||
| single value to indicate whether the expected value is a member of a list or how | ||||
| it compares to another value. | ||||
| 
 | ||||
| > ```json | ||||
| > ### Example | ||||
| > [ | ||||
| >   {"LEMMA": {"IN": ["like", "love", "enjoy"]}}, | ||||
| >   {"POS": "PROPN", "LENGTH": {">=": 10}}, | ||||
| > ] | ||||
| > ``` | ||||
| 
 | ||||
| | Attribute                  | Type       | Description                                                                       | | ||||
| | -------------------------- | ---------- | --------------------------------------------------------------------------------- | | ||||
| | `IN`                       | any        | Attribute value is member of a list.                                              | | ||||
| | `NOT_IN`                   | any        | Attribute value is _not_ member of a list.                                        | | ||||
| | `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. | | ||||
| 
 | ||||
| ## Matcher.\_\_init\_\_ {#init tag="method"} | ||||
| 
 | ||||
| Create the rule-based `Matcher`. If `validate=True` is set, all patterns added | ||||
|  | @ -60,7 +136,7 @@ Match a stream of documents, yielding them in turn. | |||
| 
 | ||||
| | Name                                          | Type     | Description                                                                                                                                                                                                                | | ||||
| | --------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `docs`                                        | iterable | A stream of documents.                                                                                                                                                                                                     | | ||||
| | `docs`                                        | iterable | A stream of documents or spans.                                                                                                                                                                                            | | ||||
| | `batch_size`                                  | int      | The number of documents to accumulate into a working set.                                                                                                                                                                  | | ||||
| | `return_matches` <Tag variant="new">2.1</Tag> | bool     | Yield the match lists along with the docs, making results `(doc, matches)` tuples.                                                                                                                                         | | ||||
| | `as_tuples`                                   | bool     | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. | | ||||
|  | @ -105,11 +181,11 @@ Check whether the matcher contains rules for a match ID. | |||
| 
 | ||||
| ## Matcher.add {#add tag="method" new="2"} | ||||
| 
 | ||||
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and a | ||||
| callback function to act on the matches. The callback function will receive the | ||||
| arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for | ||||
| the given ID, the patterns will be extended. An `on_match` callback will be | ||||
| overwritten. | ||||
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and an | ||||
| optional callback function to act on the matches. The callback function will | ||||
| receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already | ||||
| exists for the given ID, the patterns will be extended. An `on_match` callback | ||||
| will be overwritten. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -142,11 +218,12 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]] | |||
| </Infobox> | ||||
| 
 | ||||
| | Name                                | Type               | Description                                                                                   | | ||||
| | -------------- | ------------------ | --------------------------------------------------------------------------------------------- | | ||||
| | ----------------------------------- | ------------------ | --------------------------------------------------------------------------------------------- | | ||||
| | `match_id`                          | str                | An ID for the thing you're matching.                                                          | | ||||
| | `patterns`     | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a token.      | | ||||
| | `patterns`                          | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token.      | | ||||
| | _keyword-only_                      |                    |                                                                                               | | ||||
| | `on_match`     | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | ||||
| | `on_match`                          | callable / `None`  | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | ||||
| | `greedy` <Tag variant="new">3</Tag> | str                | Optional filter for greedy matches. Can be either `"FIRST"` or `"LONGEST"`.                   | | ||||
| 
 | ||||
| ## Matcher.remove {#remove tag="method" new="2"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -63,16 +63,14 @@ Create a new pipeline instance. In your application, you would normally use a | |||
| shortcut for this and instantiate the component using its string name and | ||||
| [`nlp.add_pipe`](/api/language#add_pipe). | ||||
| 
 | ||||
| <!-- TODO: finish API docs --> | ||||
| 
 | ||||
| | Name           | Type    | Description                                                                                 | | ||||
| | -------------- | ------- | ------------------------------------------------------------------------------------------- | | ||||
| | `vocab`        | `Vocab` | The shared vocabulary.                                                                      | | ||||
| | `model`        | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component.             | | ||||
| | `name`         | str     | String name of the component instance. Used to add entries to the `losses` during training. | | ||||
| | _keyword-only_ |         |                                                                                             | | ||||
| | `labels_morph` | dict    |                                                                                             | | ||||
| | `labels_pos`   | dict    |                                                                                             | | ||||
| | `labels_morph` | dict    | Mapping of morph + POS tags to morph labels.                                                | | ||||
| | `labels_pos`   | dict    | Mapping of morph + POS tags to POS tags.                                                    | | ||||
| 
 | ||||
| ## Morphologizer.\_\_call\_\_ {#call tag="method"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -9,7 +9,8 @@ new: 2 | |||
| The `PhraseMatcher` lets you efficiently match large terminology lists. While | ||||
| the [`Matcher`](/api/matcher) lets you match sequences based on lists of token | ||||
| descriptions, the `PhraseMatcher` accepts match patterns in the form of `Doc` | ||||
| objects. | ||||
| objects. See the [usage guide](/usage/rule-based-matching#phrasematcher) for | ||||
| examples. | ||||
| 
 | ||||
| ## PhraseMatcher.\_\_init\_\_ {#init tag="method"} | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user