Update docs and fix consistency
parent 7c6854d8d4 · commit d5c78c7a34
spacy/cli/pretrain.py

@@ -35,7 +35,7 @@ def pretrain_cli(
    config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
-    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
+    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    # fmt: on
):

spacy/matcher/dependencymatcher.pyx

@@ -68,11 +68,11 @@ cdef class DependencyMatcher:
        key (str): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.
        """
-        return self._normalize_key(key) in self._patterns
+        return self.has_key(key)

-    def validateInput(self, pattern, key):
+    def validate_input(self, pattern, key):
        idx = 0
-        visitedNodes = {}
+        visited_nodes = {}
        for relation in pattern:
            if "PATTERN" not in relation or "SPEC" not in relation:
                raise ValueError(Errors.E098.format(key=key))
@@ -83,7 +83,7 @@ cdef class DependencyMatcher:
                    and "NBOR_NAME" not in relation["SPEC"]
                ):
                    raise ValueError(Errors.E099.format(key=key))
-                visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
+                visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
            else:
                if not(
                    "NODE_NAME" in relation["SPEC"]
@@ -92,22 +92,28 @@ cdef class DependencyMatcher:
                ):
                    raise ValueError(Errors.E100.format(key=key))
                if (
-                    relation["SPEC"]["NODE_NAME"] in visitedNodes
-                    or relation["SPEC"]["NBOR_NAME"] not in visitedNodes
+                    relation["SPEC"]["NODE_NAME"] in visited_nodes
+                    or relation["SPEC"]["NBOR_NAME"] not in visited_nodes
                ):
                    raise ValueError(Errors.E101.format(key=key))
-                visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
-                visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True
+                visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
+                visited_nodes[relation["SPEC"]["NBOR_NAME"]] = True
            idx = idx + 1

    def add(self, key, patterns, *_patterns, on_match=None):
+        """Add a new matcher rule to the matcher.
+
+        key (str): The match ID.
+        patterns (list): The patterns to add for the given key.
+        on_match (callable): Optional callback executed on match.
+        """
        if patterns is None or hasattr(patterns, "__call__"):  # old API
            on_match = patterns
            patterns = _patterns
        for pattern in patterns:
            if len(pattern) == 0:
                raise ValueError(Errors.E012.format(key=key))
-            self.validateInput(pattern,key)
+            self.validate_input(pattern,key)
        key = self._normalize_key(key)
        _patterns = []
        for pattern in patterns:
@@ -187,8 +193,7 @@ cdef class DependencyMatcher:
        key (string or int): The key to check.
        RETURNS (bool): Whether the matcher has the rule.
        """
-        key = self._normalize_key(key)
-        return key in self._patterns
+        return self._normalize_key(key) in self._patterns

    def get(self, key, default=None):
        """Retrieve the pattern stored for a key.
@@ -202,6 +207,13 @@ cdef class DependencyMatcher:
        return (self._callbacks[key], self._patterns[key])

    def __call__(self, Doc doc):
+        """Find all token sequences matching the supplied pattern.
+
+        doclike (Doc or Span): The document to match over.
+        RETURNS (list): A list of `(key, start, end)` tuples,
+            describing the matches. A match tuple describes a span
+            `doc[start:end]`. The `label_id` and `key` are both integers.
+        """
        matched_key_trees = []
        matches = self.token_matcher(doc)
        for key in list(self._patterns.keys()):
@@ -241,25 +253,25 @@ cdef class DependencyMatcher:
                on_match(self, doc, i, matched_key_trees)
        return matched_key_trees

-    def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees):
+    def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visited_nodes,matched_trees):
        cdef bool isValid;
        if(patternLength == len(id_to_position.keys())):
            isValid = True
            for node in range(patternLength):
                if(node in tree):
                    for idx, (relop,nbor) in enumerate(tree[node]):
-                        computed_nbors = numpy.asarray(_node_operator_map[visitedNodes[node]][relop])
+                        computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop])
                        isNbor = False
                        for computed_nbor in computed_nbors:
-                            if(computed_nbor.i == visitedNodes[nbor]):
+                            if(computed_nbor.i == visited_nodes[nbor]):
                                isNbor = True
                        isValid = isValid & isNbor
            if(isValid):
-                matched_trees.append(visitedNodes)
+                matched_trees.append(visited_nodes)
            return
        allPatternNodes = numpy.asarray(id_to_position[patternLength])
        for patternNode in allPatternNodes:
-            self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visitedNodes+[patternNode],matched_trees)
+            self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visited_nodes+[patternNode],matched_trees)

    # Given a node and an edge operator, to return the list of nodes
    # from the doc that belong to node+operator. This is used to store

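To see what the renamed `validate_input` enforces: the first node may only declare a `NODE_NAME`, and every later node must link via `NBOR_NAME` to a name that has already been visited. A sketch of a pattern that fails the second check (it assumes a loaded `nlp` pipeline; the names are illustrative):

```python
from spacy.matcher import DependencyMatcher

matcher = DependencyMatcher(nlp.vocab)
bad_pattern = [
    {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
    # "NBOR_NAME" points at a node name that was never declared, so
    # validate_input raises ValueError with Errors.E101.
    {"SPEC": {"NODE_NAME": "subject", "NBOR_RELOP": ">", "NBOR_NAME": "missing"},
     "PATTERN": {"DEP": "nsubj"}},
]
matcher.add("FOUNDED", [bad_pattern])  # raises ValueError
```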
spacy/matcher/matcher.pyx

@@ -70,7 +70,7 @@ cdef class Matcher:
        key (str): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.
        """
-        return self._normalize_key(key) in self._patterns
+        return self.has_key(key)

    def add(self, key, patterns, *, on_match=None, greedy: str=None):
        """Add a match-rule to the matcher. A match-rule consists of: an ID
@@ -162,8 +162,7 @@ cdef class Matcher:
        key (string or int): The key to check.
        RETURNS (bool): Whether the matcher has the rule.
        """
-        key = self._normalize_key(key)
-        return key in self._patterns
+        return self._normalize_key(key) in self._patterns

    def get(self, key, default=None):
        """Retrieve the pattern stored for a key.
@@ -179,7 +178,7 @@ cdef class Matcher:
    def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
        """Match a stream of documents, yielding them in turn.

-        docs (iterable): A stream of documents.
+        docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
        batch_size (int): Number of documents to accumulate into a working set.
        return_matches (bool): Yield the match lists along with the docs, making
            results (doc, matches) tuples.

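The v3 `Matcher.add` signature shown in the first hunk is keyword-only after `patterns`. A sketch of the new call style (the pattern and pipeline are illustrative; per the docs below, `greedy` accepts `"FIRST"` or `"LONGEST"`):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
# on_match and greedy must now be passed as keyword arguments.
matcher.add("HELLO_WORLD", [pattern], greedy="LONGEST")
```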
spacy/pipeline/morphologizer.pyx

@@ -75,8 +75,8 @@ class Morphologizer(Tagger):
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
-        labels_morph (dict): TODO:
-        labels_pos (dict): TODO:
+        labels_morph (dict): Mapping of morph + POS tags to morph labels.
+        labels_pos (dict): Mapping of morph + POS tags to POS tags.

        DOCS: https://spacy.io/api/morphologizer#init
        """

website/docs/api/cli.md

@@ -601,9 +601,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides

## Pretrain {#pretrain new="2.1" tag="experimental"}

-<!-- TODO: document new pretrain command and link to new pretraining docs -->
-
-Pre-train the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
+Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
components on [raw text](/api/data-formats#pretrain), using an approximate
language-modeling objective. Specifically, we load pretrained vectors, and train
a component like a CNN, BiLSTM, etc to predict vectors which match the
@@ -611,7 +609,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can
then include a **path to one of these pretrained weights files** in your
[training config](/usage/training#config) as the `init_tok2vec` setting when you
train your model. This technique may be especially helpful if you have little
-labelled data.
+labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
+for more info.

<Infobox title="Changed in v3.0" variant="warning">

@@ -634,8 +633,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]
| `output_dir`            | positional | Directory to write models to on each epoch.                                                                                                                   |
| `config_path`           | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters.                                                         |
| `--code`, `-c`          | option     | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures.          |
-| `--resume-path`, `-r`   | option     | TODO:                                                                                                                                                         |
-| `--epoch-resume`, `-er` | option     | TODO:                                                                                                                                                         |
+| `--resume-path`, `-r`   | option     | Path to pretrained weights from which to resume pretraining.                                                                                                  |
+| `--epoch-resume`, `-er` | option     | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files.                                       |
| `--help`, `-h`          | flag       | Show help message and available arguments.                                                                                                                    |
| overrides               |            | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
| **CREATES**             | weights    | The pretrained weights that can be used to initialize `spacy train`.                                                                                          |

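For reference, the resume options documented in the table combine like this (a sketch; the corpus file, output directory, weights filename, and epoch number are all illustrative):

```bash
python -m spacy pretrain texts.jsonl ./pretrain_output config.cfg \
    --resume-path ./pretrain_output/model4.bin --epoch-resume 5
```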
website/docs/api/data-formats.md

@@ -20,9 +20,9 @@ Config files define the training process and model pipeline and can be passed to
[`spacy train`](/api/cli#train). They use
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
hood. For details on how to use training configs, see the
-[usage documentation](/usage/training#config).
-
-<!-- TODO: add details on getting started and init config -->
+[usage documentation](/usage/training#config). To get started with a blank
+config or fill a partial config with all defaults, you can use the
+[`init config`](/api/cli#init-config) command.

> #### What does the @ mean?
>
@@ -52,8 +52,6 @@ your config and check that it's valid, you can run the

</Infobox>

-<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
-
### nlp {#config-nlp tag="section"}

> #### Example
@@ -154,8 +152,6 @@ This section is optional and defines settings and controls for
[language model pretraining](/usage/training#pretraining). It's used when you
run [`spacy pretrain`](/api/cli#pretrain).

-<!-- TODO: complete -->
-
| Name         | Type | Description               | Default |
| ------------ | ---- | ------------------------- | ------- |
| `max_epochs` | int  | Maximum number of epochs. | `1000`  |

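The `[pretraining]` block documented above lives in the same Thinc-style config file as the other sections; a minimal sketch showing only the documented setting with its default:

```ini
[pretraining]
max_epochs = 1000
```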
website/docs/api/dependencymatcher.md

@@ -5,4 +5,194 @@ tag: class
source: spacy/matcher/dependencymatcher.pyx
---

-TODO: write
+The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher)
+and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees
+using the
+[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html).
+It requires a pretrained [`DependencyParser`](/api/parser) or other component
+that sets the `Token.dep` attribute.
+
+## Pattern format {#patterns}
+
+> ```json
+> ### Example
+> [
+>   {
+>     "SPEC": {"NODE_NAME": "founded"},
+>     "PATTERN": {"ORTH": "founded"}
+>   },
+>   {
+>     "SPEC": {
+>       "NODE_NAME": "founder",
+>       "NBOR_RELOP": ">",
+>       "NBOR_NAME": "founded"
+>     },
+>     "PATTERN": {"DEP": "nsubj"}
+>   },
+>   {
+>     "SPEC": {
+>       "NODE_NAME": "object",
+>       "NBOR_RELOP": ">",
+>       "NBOR_NAME": "founded"
+>     },
+>     "PATTERN": {"DEP": "dobj"}
+>   }
+> ]
+> ```
+
+A pattern added to the `DependencyMatcher` consists of a list of dictionaries,
+with each dictionary describing a node to match. Each pattern should have the
+following top-level keys:
+
+| Name      | Type | Description |
+| --------- | ---- | ----------- |
+| `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). |
+| `SPEC`    | dict | The relationships of the nodes in the subtree that should be matched. |
+
+The `SPEC` includes the following fields:
+
+| Name         | Type | Description |
+| ------------ | ---- | ----------- |
+| `NODE_NAME`  | str  | A unique name for this node to refer to it in other specs. |
+| `NBOR_RELOP` | str  | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. |
+| `NBOR_NAME`  | str  | The unique name of the node that this node is connected to. |
+
+## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
+
+Create a rule-based `DependencyMatcher`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+> matcher = DependencyMatcher(nlp.vocab)
+> ```
+
+| Name    | Type    | Description |
+| ------- | ------- | ----------- |
+| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
+
+## DependencyMatcher.\_\_call\_\_ {#call tag="method"}
+
+Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> pattern = [
+>     {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+>     {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Founder", [pattern])
+> doc = nlp("Bill Gates founded Microsoft.")
+> matches = matcher(doc)
+> ```
+
+| Name        | Type         | Description |
+| ----------- | ------------ | ----------- |
+| `doclike`   | `Doc`/`Span` | The `Doc` or `Span` to match over. |
+| **RETURNS** | list         | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
+
+## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
+
+Get the number of rules (edges) added to the dependency matcher. Note that this
+only returns the number of rules (identical with the number of IDs), not the
+number of individual patterns.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert len(matcher) == 0
+> pattern = [
+>     {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+>     {"SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Rule", [pattern])
+> assert len(matcher) == 1
+> ```
+
+| Name        | Type | Description          |
+| ----------- | ---- | -------------------- |
+| **RETURNS** | int  | The number of rules. |
+
+## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"}
+
+Check whether the matcher contains rules for a match ID.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert "Rule" not in matcher
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> ```
+
+| Name        | Type | Description                                           |
+| ----------- | ---- | ----------------------------------------------------- |
+| `key`       | str  | The match ID.                                         |
+| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
+
+## DependencyMatcher.add {#add tag="method"}
+
+Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.
+
+> #### Example
+>
+> ```python
+> def on_match(matcher, doc, id, matches):
+>     print("Matched!", matches)
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> matcher.add("TEST_PATTERNS", patterns)
+> ```
+
+| Name           | Type               | Description |
+| -------------- | ------------------ | ----------- |
+| `match_id`     | str                | An ID for the thing you're matching. |
+| `patterns`     | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
+| _keyword-only_ |                    | |
+| `on_match`     | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
+## DependencyMatcher.remove {#remove tag="method"}
+
+Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
+exist.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> matcher.remove("Rule")
+> assert "Rule" not in matcher
+> ```
+
+| Name  | Type | Description               |
+| ----- | ---- | ------------------------- |
+| `key` | str  | The ID of the match rule. |
+
+## DependencyMatcher.get {#get tag="method"}
+
+Retrieve the pattern stored for a key. Returns the rule as an
+`(on_match, patterns)` tuple containing the callback and available patterns.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern], on_match=on_match)
+> on_match, patterns = matcher.get("Rule")
+> ```
+
+| Name        | Type  | Description                                   |
+| ----------- | ----- | --------------------------------------------- |
+| `key`       | str   | The ID of the match rule.                     |
+| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |

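Taken together, the pattern format and `__call__` documentation above imply usage along these lines (a sketch based on the documented `(match_id, start, end)` return format; `nlp` is assumed to include a dependency parser so `Token.dep` is set):

```python
from spacy.matcher import DependencyMatcher

matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
    {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"},
     "PATTERN": {"DEP": "nsubj"}},
]
matcher.add("FOUNDED", [pattern])
doc = nlp("Bill Gates founded Microsoft.")
for match_id, start, end in matcher(doc):
    print(doc.vocab.strings[match_id], doc[start:end].text)
```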
website/docs/api/matcher.md

@@ -5,6 +5,82 @@ tag: class
source: spacy/matcher/matcher.pyx
---

+The `Matcher` lets you find words and phrases using rules describing their token
+attributes. Rules can refer to token annotations (like the text or
+part-of-speech tags), as well as lexical attributes like `Token.is_punct`.
+Applying the matcher to a [`Doc`](/api/doc) gives you access to the matched
+tokens in context. For in-depth examples and workflows for combining rules and
+statistical models, see the [usage guide](/usage/rule-based-matching) on
+rule-based matching.
+
+## Pattern format {#patterns}
+
+> ```json
+> ### Example
+> [
+>   {"LOWER": "i"},
+>   {"LEMMA": {"IN": ["like", "love"]}},
+>   {"POS": "NOUN", "OP": "+"}
+> ]
+> ```
+
+A pattern added to the `Matcher` consists of a list of dictionaries. Each
+dictionary describes **one token** and its attributes. The available token
+pattern keys correspond to a number of
+[`Token` attributes](/api/token#attributes). The supported attributes for
+rule-based matching are:
+
+| Attribute                             | Type | Description |
+| ------------------------------------- | ---- | ----------- |
+| `ORTH`                                | str  | The exact verbatim text of a token. |
+| `TEXT` <Tag variant="new">2.1</Tag>   | str  | The exact verbatim text of a token. |
+| `LOWER`                               | str  | The lowercase form of the token text. |
+| `LENGTH`                              | int  | The length of the token text. |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
+| `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | bool | Token text is in lowercase, uppercase, titlecase. |
+| `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | bool | Token is punctuation, whitespace, stop word. |
+| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | bool | Token text resembles a number, URL, email. |
+| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str  | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
+| `ENT_TYPE`                            | str  | The token's entity label. |
+| `_` <Tag variant="new">2.1</Tag>      | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
+| `OP`                                  | str  | Operator or quantifier to determine how often to match a token pattern. |
+
+Operators and quantifiers define **how often** a token pattern should be
+matched:
+
+> ```json
+> ### Example
+> [
+>   {"POS": "ADJ", "OP": "*"},
+>   {"POS": "NOUN", "OP": "+"}
+> ]
+> ```
+
+| OP  | Description                                                      |
+| --- | ---------------------------------------------------------------- |
+| `!` | Negate the pattern, by requiring it to match exactly 0 times.    |
+| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
+| `+` | Require the pattern to match 1 or more times.                    |
+| `*` | Allow the pattern to match zero or more times.                   |
+
+Token patterns can also map to a **dictionary of properties** instead of a
+single value to indicate whether the expected value is a member of a list or how
+it compares to another value.
+
+> ```json
+> ### Example
+> [
+>   {"LEMMA": {"IN": ["like", "love", "enjoy"]}},
+>   {"POS": "PROPN", "LENGTH": {">=": 10}}
+> ]
+> ```
+
+| Attribute                  | Type       | Description |
+| -------------------------- | ---------- | ----------- |
+| `IN`                       | any        | Attribute value is member of a list. |
+| `NOT_IN`                   | any        | Attribute value is _not_ member of a list. |
+| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. |
+
## Matcher.\_\_init\_\_ {#init tag="method"}

Create the rule-based `Matcher`. If `validate=True` is set, all patterns added

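Putting the attribute keys, the `OP` quantifiers, and the property dictionaries above together, usage might look like the following sketch (the pattern and text are illustrative, and only lexical attributes are used so no trained components are required):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
pattern = [
    {"LOWER": {"IN": ["hello", "hi"]}},  # property dict: membership check
    {"IS_PUNCT": True, "OP": "?"},       # quantifier: optional punctuation
    {"LOWER": "world"},                  # plain attribute value
]
matcher.add("GREETING", [pattern])
doc = nlp("Hello, world! Hi world!")
for match_id, start, end in matcher(doc):
    print(doc.vocab.strings[match_id], doc[start:end].text)
```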
@@ -60,7 +136,7 @@ Match a stream of documents, yielding them in turn.

| Name                                          | Type     | Description |
| --------------------------------------------- | -------- | ----------- |
-| `docs`                                        | iterable | A stream of documents. |
+| `docs`                                        | iterable | A stream of documents or spans. |
| `batch_size`                                  | int      | The number of documents to accumulate into a working set. |
| `return_matches` <Tag variant="new">2.1</Tag> | bool     | Yield the match lists along with the docs, making results `(doc, matches)` tuples. |
| `as_tuples`                                   | bool     | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. |

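Given `return_matches=True` as documented above, `pipe` yields `(doc, matches)` tuples; a small sketch (reusing the `nlp` and `matcher` from the previous example):

```python
texts = ["Hello, world!", "Hi world!"]
docs = (nlp(text) for text in texts)
for doc, matches in matcher.pipe(docs, batch_size=50, return_matches=True):
    print(doc.text, len(matches))
```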
@@ -105,11 +181,11 @@ Check whether the matcher contains rules for a match ID.

## Matcher.add {#add tag="method" new="2"}

-Add a rule to the matcher, consisting of an ID key, one or more patterns, and a
-callback function to act on the matches. The callback function will receive the
-arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for
-the given ID, the patterns will be extended. An `on_match` callback will be
-overwritten.
+Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.

> #### Example
>
@ -141,12 +217,13 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
|
|||
|
||||
</Infobox>
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
||||
| _keyword-only_ | | |
|
||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
| Name | Type | Description |
|
||||
| ----------------------------------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `patterns` | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
||||
| _keyword-only_ | | |
|
||||
| `on_match` | callable / `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
| `greedy` <Tag variant="new">3</Tag> | str | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. |
|
||||
|
||||
## Matcher.remove {#remove tag="method" new="2"}
|
||||
|
||||
|
|
|
website/docs/api/morphologizer.md

@@ -63,16 +63,14 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).

-<!-- TODO: finish API docs -->
-
| Name           | Type    | Description |
| -------------- | ------- | ----------- |
| `vocab`        | `Vocab` | The shared vocabulary. |
| `model`        | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name`         | str     | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ |         | |
-| `labels_morph` | dict    | |
-| `labels_pos`   | dict    | |
+| `labels_morph` | dict    | Mapping of morph + POS tags to morph labels. |
+| `labels_pos`   | dict    | Mapping of morph + POS tags to POS tags. |

## Morphologizer.\_\_call\_\_ {#call tag="method"}

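As the paragraph above notes, the constructor is rarely called directly; the component is usually added by its string name. A sketch of that route (assuming a v3-style pipeline):

```python
import spacy

nlp = spacy.blank("en")
# Adds the component with its default model and config and returns it.
morphologizer = nlp.add_pipe("morphologizer")
```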
website/docs/api/phrasematcher.md

@@ -9,7 +9,8 @@ new: 2
The `PhraseMatcher` lets you efficiently match large terminology lists. While
the [`Matcher`](/api/matcher) lets you match sequences based on lists of token
descriptions, the `PhraseMatcher` accepts match patterns in the form of `Doc`
-objects.
+objects. See the [usage guide](/usage/rule-based-matching#phrasematcher) for
+examples.

## PhraseMatcher.\_\_init\_\_ {#init tag="method"}

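To make the `Doc`-patterns idea concrete, a sketch of typical terminology matching (the terms and pipeline are illustrative):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel"]
# Patterns are Doc objects; nlp.make_doc only runs the tokenizer, which is cheap.
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("POLITICIANS", patterns)
doc = nlp("Angela Merkel met Barack Obama.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
```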