Update docs and fix consistency

Ines Montani, 2020-08-09 22:31:52 +02:00
parent 7c6854d8d4
commit d5c78c7a34
10 changed files with 326 additions and 54 deletions

spacy/cli/pretrain.py

@@ -35,7 +35,7 @@ def pretrain_cli(
     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
-    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
+    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     # fmt: on
 ):

spacy/matcher/dependencymatcher.pyx

@@ -68,11 +68,11 @@ cdef class DependencyMatcher:
         key (str): The match ID.
         RETURNS (bool): Whether the matcher contains rules for this match ID.
         """
-        return self._normalize_key(key) in self._patterns
+        return self.has_key(key)

-    def validateInput(self, pattern, key):
+    def validate_input(self, pattern, key):
         idx = 0
-        visitedNodes = {}
+        visited_nodes = {}
         for relation in pattern:
             if "PATTERN" not in relation or "SPEC" not in relation:
                 raise ValueError(Errors.E098.format(key=key))

@@ -83,7 +83,7 @@ cdef class DependencyMatcher:
                     and "NBOR_NAME" not in relation["SPEC"]
                 ):
                     raise ValueError(Errors.E099.format(key=key))
-                visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
+                visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
             else:
                 if not(
                     "NODE_NAME" in relation["SPEC"]

@@ -92,22 +92,28 @@ cdef class DependencyMatcher:
                 ):
                     raise ValueError(Errors.E100.format(key=key))
                 if (
-                    relation["SPEC"]["NODE_NAME"] in visitedNodes
-                    or relation["SPEC"]["NBOR_NAME"] not in visitedNodes
+                    relation["SPEC"]["NODE_NAME"] in visited_nodes
+                    or relation["SPEC"]["NBOR_NAME"] not in visited_nodes
                 ):
                     raise ValueError(Errors.E101.format(key=key))
-                visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
-                visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True
+                visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
+                visited_nodes[relation["SPEC"]["NBOR_NAME"]] = True
             idx = idx + 1

     def add(self, key, patterns, *_patterns, on_match=None):
+        """Add a new matcher rule to the matcher.
+
+        key (str): The match ID.
+        patterns (list): The patterns to add for the given key.
+        on_match (callable): Optional callback executed on match.
+        """
         if patterns is None or hasattr(patterns, "__call__"):  # old API
             on_match = patterns
             patterns = _patterns
         for pattern in patterns:
             if len(pattern) == 0:
                 raise ValueError(Errors.E012.format(key=key))
-            self.validateInput(pattern, key)
+            self.validate_input(pattern, key)
         key = self._normalize_key(key)
         _patterns = []
         for pattern in patterns:

@@ -187,8 +193,7 @@ cdef class DependencyMatcher:
         key (string or int): The key to check.
         RETURNS (bool): Whether the matcher has the rule.
         """
-        key = self._normalize_key(key)
-        return key in self._patterns
+        return self._normalize_key(key) in self._patterns

     def get(self, key, default=None):
         """Retrieve the pattern stored for a key.

@@ -202,6 +207,13 @@ cdef class DependencyMatcher:
         return (self._callbacks[key], self._patterns[key])

     def __call__(self, Doc doc):
+        """Find all token sequences matching the supplied pattern.
+
+        doclike (Doc or Span): The document to match over.
+        RETURNS (list): A list of `(key, start, end)` tuples,
+            describing the matches. A match tuple describes a span
+            `doc[start:end]`. The `label_id` and `key` are both integers.
+        """
         matched_key_trees = []
         matches = self.token_matcher(doc)
         for key in list(self._patterns.keys()):

@@ -241,25 +253,25 @@ cdef class DependencyMatcher:
                 on_match(self, doc, i, matched_key_trees)
         return matched_key_trees

-    def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visitedNodes, matched_trees):
+    def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees):
         cdef bool isValid;
         if(patternLength == len(id_to_position.keys())):
             isValid = True
             for node in range(patternLength):
                 if(node in tree):
                     for idx, (relop, nbor) in enumerate(tree[node]):
-                        computed_nbors = numpy.asarray(_node_operator_map[visitedNodes[node]][relop])
+                        computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop])
                         isNbor = False
                         for computed_nbor in computed_nbors:
-                            if(computed_nbor.i == visitedNodes[nbor]):
+                            if(computed_nbor.i == visited_nodes[nbor]):
                                 isNbor = True
                         isValid = isValid & isNbor
             if(isValid):
-                matched_trees.append(visitedNodes)
+                matched_trees.append(visited_nodes)
             return
         allPatternNodes = numpy.asarray(id_to_position[patternLength])
         for patternNode in allPatternNodes:
-            self.recurse(tree, id_to_position, _node_operator_map, patternLength + 1, visitedNodes + [patternNode], matched_trees)
+            self.recurse(tree, id_to_position, _node_operator_map, patternLength + 1, visited_nodes + [patternNode], matched_trees)

     # Given a node and an edge operator, to return the list of nodes
     # from the doc that belong to node+operator. This is used to store
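
To illustrate the now-consistent key handling: `__contains__` delegates to `has_key`, and keys are normalized in one place, so string and hash-based lookups behave the same. A minimal sketch (the `"FOUNDED"` key and pattern are hypothetical, and an existing `nlp` pipeline is assumed):

```python
from spacy.matcher import DependencyMatcher

matcher = DependencyMatcher(nlp.vocab)
pattern = [{"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}}]
matcher.add("FOUNDED", [pattern])
# __contains__ delegates to has_key, and both normalize the key,
# so the string key and its hash work interchangeably
assert "FOUNDED" in matcher
assert matcher.has_key("FOUNDED")
assert nlp.vocab.strings["FOUNDED"] in matcher
```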

spacy/matcher/matcher.pyx

@@ -70,7 +70,7 @@ cdef class Matcher:
         key (str): The match ID.
         RETURNS (bool): Whether the matcher contains rules for this match ID.
         """
-        return self._normalize_key(key) in self._patterns
+        return self.has_key(key)

     def add(self, key, patterns, *, on_match=None, greedy: str=None):
         """Add a match-rule to the matcher. A match-rule consists of: an ID

@@ -162,8 +162,7 @@ cdef class Matcher:
         key (string or int): The key to check.
         RETURNS (bool): Whether the matcher has the rule.
         """
-        key = self._normalize_key(key)
-        return key in self._patterns
+        return self._normalize_key(key) in self._patterns

     def get(self, key, default=None):
         """Retrieve the pattern stored for a key.

@@ -179,7 +178,7 @@ cdef class Matcher:
     def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
         """Match a stream of documents, yielding them in turn.

-        docs (iterable): A stream of documents.
+        docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
         batch_size (int): Number of documents to accumulate into a working set.
         return_matches (bool): Yield the match lists along with the docs, making
             results (doc, matches) tuples.

spacy/pipeline/morphologizer.pyx

@@ -75,8 +75,8 @@ class Morphologizer(Tagger):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        labels_morph (dict): TODO:
-        labels_pos (dict): TODO:
+        labels_morph (dict): Mapping of morph + POS tags to morph labels.
+        labels_pos (dict): Mapping of morph + POS tags to POS tags.

         DOCS: https://spacy.io/api/morphologizer#init
         """

website/docs/api/cli.md

@@ -601,9 +601,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides

 ## Pretrain {#pretrain new="2.1" tag="experimental"}

-<!-- TODO: document new pretrain command and link to new pretraining docs -->
-
-Pre-train the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
+Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
 components on [raw text](/api/data-formats#pretrain), using an approximate
 language-modeling objective. Specifically, we load pretrained vectors, and train
 a component like a CNN, BiLSTM, etc to predict vectors which match the

@@ -611,7 +609,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can
 then include a **path to one of these pretrained weights files** in your
 [training config](/usage/training#config) as the `init_tok2vec` setting when you
 train your model. This technique may be especially helpful if you have little
-labelled data.
+labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
+for more info.
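
To illustrate the `init_tok2vec` setting mentioned above, a minimal config excerpt; the block name and weights filename here are assumptions for illustration only:

```ini
[training]
init_tok2vec = "pretrain_output/model99.bin"
```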
 <Infobox title="Changed in v3.0" variant="warning">
@@ -634,8 +633,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]

 | `output_dir`            | positional | Directory to write models to on each epoch. |
 | `config_path`           | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
 | `--code`, `-c`          | option     | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
-| `--resume-path`, `-r`   | option     | TODO: |
-| `--epoch-resume`, `-er` | option     | TODO: |
+| `--resume-path`, `-r`   | option     | Path to pretrained weights from which to resume pretraining. |
+| `--epoch-resume`, `-er` | option     | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. |
 | `--help`, `-h`          | flag       | Show help message and available arguments. |
 | overrides               |            | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
 | **CREATES**             | weights    | The pretrained weights that can be used to initialize `spacy train`. |
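
For context, a hypothetical invocation that resumes pretraining from a previously saved weights file (all paths and the `model99.bin` filename are invented):

```cli
$ python -m spacy pretrain texts.jsonl ./pretrain_output config.cfg --resume-path ./pretrain_output/model99.bin --epoch-resume 100
```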

website/docs/api/data-formats.md

@@ -20,9 +20,9 @@ Config files define the training process and model pipeline and can be passed to
 [`spacy train`](/api/cli#train). They use
 [Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
 hood. For details on how to use training configs, see the
-[usage documentation](/usage/training#config).
-
-<!-- TODO: add details on getting started and init config -->
+[usage documentation](/usage/training#config). To get started with a blank
+config or fill a partial config with all defaults, you can use the
+[`init config`](/api/cli#init-config) command.

 > #### What does the @ mean?
 >
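
As a rough sketch of that workflow (the exact flags are an assumption; check `python -m spacy init config --help` for the real options):

```cli
$ python -m spacy init config config.cfg --lang en --pipeline tagger,parser
```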
@@ -52,8 +52,6 @@ your config and check that it's valid, you can run the

 </Infobox>

-<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
-
 ### nlp {#config-nlp tag="section"}

 > #### Example

@@ -154,8 +152,6 @@ This section is optional and defines settings and controls for
 [language model pretraining](/usage/training#pretraining). It's used when you
 run [`spacy pretrain`](/api/cli#pretrain).

-<!-- TODO: complete -->
-
 | Name         | Type | Description               | Default |
 | ------------ | ---- | ------------------------- | ------- |
 | `max_epochs` | int  | Maximum number of epochs. | `1000`  |

website/docs/api/dependencymatcher.md

@@ -5,4 +5,194 @@ tag: class
 source: spacy/matcher/dependencymatcher.pyx
 ---

-TODO: write
+The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher)
+and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees
+using the
+[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html).
+It requires a pretrained [`DependencyParser`](/api/dependencyparser) or other
+component that sets the `Token.dep` attribute.
+
+## Pattern format {#patterns}
+
+> ```json
+> ### Example
+> [
+>   {
+>     "SPEC": {"NODE_NAME": "founded"},
+>     "PATTERN": {"ORTH": "founded"}
+>   },
+>   {
+>     "SPEC": {
+>       "NODE_NAME": "founder",
+>       "NBOR_RELOP": ">",
+>       "NBOR_NAME": "founded"
+>     },
+>     "PATTERN": {"DEP": "nsubj"}
+>   },
+>   {
+>     "SPEC": {
+>       "NODE_NAME": "object",
+>       "NBOR_RELOP": ">",
+>       "NBOR_NAME": "founded"
+>     },
+>     "PATTERN": {"DEP": "dobj"}
+>   }
+> ]
+> ```
+
+A pattern added to the `DependencyMatcher` consists of a list of dictionaries,
+with each dictionary describing a node to match. Each pattern should have the
+following top-level keys:
+
+| Name      | Type | Description |
+| --------- | ---- | ----------- |
+| `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). |
+| `SPEC`    | dict | The relationships of the nodes in the subtree that should be matched. |
+
+The `SPEC` includes the following fields:
+
+| Name         | Type | Description |
+| ------------ | ---- | ----------- |
+| `NODE_NAME`  | str  | A unique name for this node to refer to it in other specs. |
+| `NBOR_RELOP` | str  | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. |
+| `NBOR_NAME`  | str  | The unique name of the node that this node is connected to. |
+
+## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
+
+Create a rule-based `DependencyMatcher`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+> matcher = DependencyMatcher(nlp.vocab)
+> ```
+
+| Name    | Type    | Description |
+| ------- | ------- | ----------- |
+| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
+
+## DependencyMatcher.\_\_call\_\_ {#call tag="method"}
+
+Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> pattern = [
+>     {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+>     {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Founder", [pattern])
+> doc = nlp("Bill Gates founded Microsoft.")
+> matches = matcher(doc)
+> ```
+
+| Name        | Type         | Description |
+| ----------- | ------------ | ----------- |
+| `doclike`   | `Doc`/`Span` | The `Doc` or `Span` to match over. |
+| **RETURNS** | list         | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
+
+## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
+
+Get the number of rules added to the dependency matcher. Note that this only
+returns the number of rules (identical with the number of IDs), not the number
+of individual patterns.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert len(matcher) == 0
+> pattern = [
+>     {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+>     {"SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Rule", [pattern])
+> assert len(matcher) == 1
+> ```
+
+| Name        | Type | Description          |
+| ----------- | ---- | -------------------- |
+| **RETURNS** | int  | The number of rules. |
+
+## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"}
+
+Check whether the matcher contains rules for a match ID.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert "Rule" not in matcher
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> ```
+
+| Name        | Type | Description                                           |
+| ----------- | ---- | ----------------------------------------------------- |
+| `key`       | str  | The match ID.                                         |
+| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
+
+## DependencyMatcher.add {#add tag="method"}
+
+Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.
+
+> #### Example
+>
+> ```python
+> def on_match(matcher, doc, id, matches):
+>     print('Matched!', matches)
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> matcher.add("TEST_PATTERNS", patterns)
+> ```
+
+| Name           | Type               | Description |
+| -------------- | ------------------ | ----------- |
+| `match_id`     | str                | An ID for the thing you're matching. |
+| `patterns`     | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a node. |
+| _keyword-only_ |                    | |
+| `on_match`     | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
+## DependencyMatcher.remove {#remove tag="method"}
+
+Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
+exist.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> matcher.remove("Rule")
+> assert "Rule" not in matcher
+> ```
+
+| Name  | Type | Description               |
+| ----- | ---- | ------------------------- |
+| `key` | str  | The ID of the match rule. |
+
+## DependencyMatcher.get {#get tag="method"}
+
+Retrieve the pattern stored for a key. Returns the rule as an
+`(on_match, patterns)` tuple containing the callback and available patterns.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern], on_match=on_match)
+> on_match, patterns = matcher.get("Rule")
+> ```
+
+| Name        | Type  | Description                                   |
+| ----------- | ----- | --------------------------------------------- |
+| `key`       | str   | The ID of the match rule.                     |
+| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |

website/docs/api/matcher.md

@@ -5,6 +5,82 @@ tag: class
 source: spacy/matcher/matcher.pyx
 ---

+The `Matcher` lets you find words and phrases using rules describing their token
+attributes. Rules can refer to token annotations (like the text or
+part-of-speech tags), as well as lexical attributes like `Token.is_punct`.
+Applying the matcher to a [`Doc`](/api/doc) gives you access to the matched
+tokens in context. For in-depth examples and workflows for combining rules and
+statistical models, see the [usage guide](/usage/rule-based-matching) on
+rule-based matching.
+
+## Pattern format {#patterns}
+
+> ```json
+> ### Example
+> [
+>   {"LOWER": "i"},
+>   {"LEMMA": {"IN": ["like", "love"]}},
+>   {"POS": "NOUN", "OP": "+"}
+> ]
+> ```
+
+A pattern added to the `Matcher` consists of a list of dictionaries. Each
+dictionary describes **one token** and its attributes. The available token
+pattern keys correspond to a number of
+[`Token` attributes](/api/token#attributes). The supported attributes for
+rule-based matching are:
+
+| Attribute                             | Type | Description |
+| ------------------------------------- | ---- | ----------- |
+| `ORTH`                                | str  | The exact verbatim text of a token. |
+| `TEXT` <Tag variant="new">2.1</Tag>   | str  | The exact verbatim text of a token. |
+| `LOWER`                               | str  | The lowercase form of the token text. |
+| `LENGTH`                              | int  | The length of the token text. |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
+| `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | bool | Token text is in lowercase, uppercase, titlecase. |
+| `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | bool | Token is punctuation, whitespace, stop word. |
+| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | bool | Token text resembles a number, URL, email. |
+| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str  | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
+| `ENT_TYPE`                            | str  | The token's entity label. |
+| `_` <Tag variant="new">2.1</Tag>      | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
+| `OP`                                  | str  | Operator or quantifier to determine how often to match a token pattern. |
+
+Operators and quantifiers define **how often** a token pattern should be
+matched:
+
+> ```json
+> ### Example
+> [
+>   {"POS": "ADJ", "OP": "*"},
+>   {"POS": "NOUN", "OP": "+"}
+> ]
+> ```
+
+| OP  | Description                                                      |
+| --- | ---------------------------------------------------------------- |
+| `!` | Negate the pattern, by requiring it to match exactly 0 times.    |
+| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
+| `+` | Require the pattern to match 1 or more times.                    |
+| `*` | Allow the pattern to match zero or more times.                   |
+
+Token patterns can also map to a **dictionary of properties** instead of a
+single value to indicate whether the expected value is a member of a list or how
+it compares to another value.
+
+> ```json
+> ### Example
+> [
+>   {"LEMMA": {"IN": ["like", "love", "enjoy"]}},
+>   {"POS": "PROPN", "LENGTH": {">=": 10}}
+> ]
+> ```
+
+| Attribute                  | Type       | Description                                                                       |
+| -------------------------- | ---------- | --------------------------------------------------------------------------------- |
+| `IN`                       | any        | Attribute value is member of a list.                                              |
+| `NOT_IN`                   | any        | Attribute value is _not_ member of a list.                                        |
+| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. |
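
A short end-to-end sketch of the dictionary-of-properties patterns above; the `"PREFERENCE"` key and example text are invented, and an installed `en_core_web_sm` model is assumed:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# "like"/"love"/"enjoy" (matched by lemma) followed by one or more nouns
pattern = [
    {"LEMMA": {"IN": ["like", "love", "enjoy"]}},
    {"POS": "NOUN", "OP": "+"},
]
matcher.add("PREFERENCE", [pattern])
doc = nlp("I really love fluffy dogs")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
```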
 ## Matcher.\_\_init\_\_ {#init tag="method"}

 Create the rule-based `Matcher`. If `validate=True` is set, all patterns added
@@ -60,7 +136,7 @@ Match a stream of documents, yielding them in turn.

 | Name | Type | Description |
 | --------------------------------------------- | -------- | ----------- |
-| `docs`                                        | iterable | A stream of documents. |
+| `docs`                                        | iterable | A stream of documents or spans. |
 | `batch_size`                                  | int      | The number of documents to accumulate into a working set. |
 | `return_matches` <Tag variant="new">2.1</Tag> | bool     | Yield the match lists along with the docs, making results `(doc, matches)` tuples. |
 | `as_tuples`                                   | bool     | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. |
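
For illustration, a hypothetical use of `pipe` with `return_matches` (the texts are invented, and `nlp` and `matcher` are assumed from the previous sketch):

```python
texts = ["I like cats", "I love dogs"]
docs = (nlp(text) for text in texts)
for doc, matches in matcher.pipe(docs, return_matches=True):
    # matches is the list of (match_id, start, end) tuples for this doc
    print(doc.text, matches)
```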
@@ -105,11 +181,11 @@ Check whether the matcher contains rules for a match ID.

 ## Matcher.add {#add tag="method" new="2"}

-Add a rule to the matcher, consisting of an ID key, one or more patterns, and a
-callback function to act on the matches. The callback function will receive the
-arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for
-the given ID, the patterns will be extended. An `on_match` callback will be
-overwritten.
+Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.

 > #### Example
 >
@@ -141,12 +217,13 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]

 </Infobox>

 | Name | Type | Description |
-| --------------                      | ------------------ | ----------- |
+| ----------------------------------- | ------------------ | ----------- |
 | `match_id`                          | str                | An ID for the thing you're matching. |
-| `patterns`                          | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
+| `patterns`                          | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
 | _keyword-only_                      |                    | |
-| `on_match`                          | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+| `on_match`                          | callable / `None`  | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+| `greedy` <Tag variant="new">3</Tag> | str                | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. |
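
A sketch of the new `greedy` filter (key, pattern and text invented): a `+` quantifier over consecutive nouns normally yields many overlapping spans, and `greedy="LONGEST"` filters them down to the longest ones:

```python
matcher = Matcher(nlp.vocab)
matcher.add("NOUNS", [[{"POS": "NOUN", "OP": "+"}]], greedy="LONGEST")
doc = nlp("I met some computer science students")
# Without the filter this would also include the shorter sub-spans
print([doc[start:end].text for _, start, end in matcher(doc)])
```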
 ## Matcher.remove {#remove tag="method" new="2"}

website/docs/api/morphologizer.md

@@ -63,16 +63,14 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).

-<!-- TODO: finish API docs -->
-
 | Name           | Type    | Description |
 | -------------- | ------- | ----------- |
 | `vocab`        | `Vocab` | The shared vocabulary. |
 | `model`        | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
 | `name`         | str     | String name of the component instance. Used to add entries to the `losses` during training. |
 | _keyword-only_ |         | |
-| `labels_morph` | dict    | |
-| `labels_pos`   | dict    | |
+| `labels_morph` | dict    | Mapping of morph + POS tags to morph labels. |
+| `labels_pos`   | dict    | Mapping of morph + POS tags to POS tags. |
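
As a minimal sketch of the `add_pipe` shortcut mentioned above (assumes a v3-style pipeline that provides the default morphologizer model):

```python
morphologizer = nlp.add_pipe("morphologizer")
```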
 ## Morphologizer.\_\_call\_\_ {#call tag="method"}

website/docs/api/phrasematcher.md

@@ -9,7 +9,8 @@ new: 2
 The `PhraseMatcher` lets you efficiently match large terminology lists. While
 the [`Matcher`](/api/matcher) lets you match sequences based on lists of token
 descriptions, the `PhraseMatcher` accepts match patterns in the form of `Doc`
-objects.
+objects. See the [usage guide](/usage/rule-based-matching#phrasematcher) for
+examples.
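
For quick context, a minimal sketch of `Doc`-based patterns; the terms and the `"NAMES"` key are invented, and an existing `nlp` pipeline is assumed:

```python
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
# Patterns are Doc objects; make_doc avoids running the full pipeline
patterns = [nlp.make_doc(term) for term in ["Barack Obama", "Angela Merkel"]]
matcher.add("NAMES", patterns)
doc = nlp("Angela Merkel met Barack Obama")
matches = matcher(doc)
```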
 ## PhraseMatcher.\_\_init\_\_ {#init tag="method"}