From d5c78c7a34fc955c434129a0981ca23671b0cd6c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 9 Aug 2020 22:31:52 +0200 Subject: [PATCH] Update docs and fix consistency --- spacy/cli/pretrain.py | 2 +- spacy/matcher/dependencymatcher.pyx | 44 +++--- spacy/matcher/matcher.pyx | 7 +- spacy/pipeline/morphologizer.pyx | 4 +- website/docs/api/cli.md | 11 +- website/docs/api/data-formats.md | 10 +- website/docs/api/dependencymatcher.md | 192 +++++++++++++++++++++++++- website/docs/api/matcher.md | 101 ++++++++++++-- website/docs/api/morphologizer.md | 6 +- website/docs/api/phrasematcher.md | 3 +- 10 files changed, 326 insertions(+), 54 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 7202ccacf..ce0eb27a0 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -35,7 +35,7 @@ def pretrain_cli( config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False), code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), - epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."), + epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), # fmt: on ): diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 716af9909..e0a54e6f1 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -68,11 +68,11 @@ cdef class DependencyMatcher: key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. """ - return self._normalize_key(key) in self._patterns + return self.has_key(key) - def validateInput(self, pattern, key): + def validate_input(self, pattern, key): idx = 0 - visitedNodes = {} + visited_nodes = {} for relation in pattern: if "PATTERN" not in relation or "SPEC" not in relation: raise ValueError(Errors.E098.format(key=key)) @@ -83,7 +83,7 @@ cdef class DependencyMatcher: and "NBOR_NAME" not in relation["SPEC"] ): raise ValueError(Errors.E099.format(key=key)) - visitedNodes[relation["SPEC"]["NODE_NAME"]] = True + visited_nodes[relation["SPEC"]["NODE_NAME"]] = True else: if not( "NODE_NAME" in relation["SPEC"] @@ -92,22 +92,28 @@ cdef class DependencyMatcher: ): raise ValueError(Errors.E100.format(key=key)) if ( - relation["SPEC"]["NODE_NAME"] in visitedNodes - or relation["SPEC"]["NBOR_NAME"] not in visitedNodes + relation["SPEC"]["NODE_NAME"] in visited_nodes + or relation["SPEC"]["NBOR_NAME"] not in visited_nodes ): raise ValueError(Errors.E101.format(key=key)) - visitedNodes[relation["SPEC"]["NODE_NAME"]] = True - visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True + visited_nodes[relation["SPEC"]["NODE_NAME"]] = True + visited_nodes[relation["SPEC"]["NBOR_NAME"]] = True idx = idx + 1 def add(self, key, patterns, *_patterns, on_match=None): + """Add a new matcher rule to the matcher. + + key (str): The match ID. + patterns (list): The patterns to add for the given key. + on_match (callable): Optional callback executed on match. 
+        """
         if patterns is None or hasattr(patterns, "__call__"):  # old API
             on_match = patterns
             patterns = _patterns
         for pattern in patterns:
             if len(pattern) == 0:
                 raise ValueError(Errors.E012.format(key=key))
-            self.validateInput(pattern,key)
+            self.validate_input(pattern, key)
         key = self._normalize_key(key)
         _patterns = []
         for pattern in patterns:
@@ -187,8 +193,7 @@ cdef class DependencyMatcher:
         key (string or int): The key to check.
         RETURNS (bool): Whether the matcher has the rule.
         """
-        key = self._normalize_key(key)
-        return key in self._patterns
+        return self._normalize_key(key) in self._patterns
 
     def get(self, key, default=None):
         """Retrieve the pattern stored for a key.
@@ -202,6 +207,13 @@ cdef class DependencyMatcher:
         return (self._callbacks[key], self._patterns[key])
 
     def __call__(self, Doc doc):
+        """Find all token sequences matching the supplied patterns.
+
+        doc (Doc): The document to match over.
+        RETURNS (list): A list of `(key, start, end)` tuples,
+            describing the matches. A match tuple describes a span
+            `doc[start:end]`. The `key` is the ID of the match rule.
+        """
         matched_key_trees = []
         matches = self.token_matcher(doc)
         for key in list(self._patterns.keys()):
@@ -241,25 +253,25 @@ cdef class DependencyMatcher:
             on_match(self, doc, i, matched_key_trees)
         return matched_key_trees
 
-    def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees):
+    def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visited_nodes,matched_trees):
         cdef bool isValid;
         if(patternLength == len(id_to_position.keys())):
             isValid = True
             for node in range(patternLength):
                 if(node in tree):
                     for idx, (relop,nbor) in enumerate(tree[node]):
-                        computed_nbors = numpy.asarray(_node_operator_map[visitedNodes[node]][relop])
+                        computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop])
                         isNbor = False
                         for computed_nbor in computed_nbors:
-                            if(computed_nbor.i == visitedNodes[nbor]):
+                            if(computed_nbor.i == visited_nodes[nbor]):
                                 isNbor = True
                         isValid = isValid & isNbor
             if(isValid):
-                matched_trees.append(visitedNodes)
+                matched_trees.append(visited_nodes)
             return
         allPatternNodes = numpy.asarray(id_to_position[patternLength])
         for patternNode in allPatternNodes:
-            self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visitedNodes+[patternNode],matched_trees)
+            self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visited_nodes+[patternNode],matched_trees)
 
 # Given a node and an edge operator, to return the list of nodes
 # from the doc that belong to node+operator. This is used to store
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index a0f3f1655..16ab73735 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -70,7 +70,7 @@ cdef class Matcher:
         key (str): The match ID.
         RETURNS (bool): Whether the matcher contains rules for this match ID.
         """
-        return self._normalize_key(key) in self._patterns
+        return self.has_key(key)
 
     def add(self, key, patterns, *, on_match=None, greedy: str=None):
         """Add a match-rule to the matcher. A match-rule consists of: an ID
@@ -162,8 +162,7 @@ cdef class Matcher:
         key (string or int): The key to check.
         RETURNS (bool): Whether the matcher has the rule.
         """
-        key = self._normalize_key(key)
-        return key in self._patterns
+        return self._normalize_key(key) in self._patterns
 
     def get(self, key, default=None):
         """Retrieve the pattern stored for a key.
@@ -179,7 +178,7 @@ cdef class Matcher: def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False): """Match a stream of documents, yielding them in turn. - docs (iterable): A stream of documents. + docs (Iterable[Union[Doc, Span]]): A stream of documents or spans. batch_size (int): Number of documents to accumulate into a working set. return_matches (bool): Yield the match lists along with the docs, making results (doc, matches) tuples. diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 06c9f9a25..efc494181 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -75,8 +75,8 @@ class Morphologizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. - labels_morph (dict): TODO: - labels_pos (dict): TODO: + labels_morph (dict): Mapping of morph + POS tags to morph labels. + labels_pos (dict): Mapping of morph + POS tags to POS tags. DOCS: https://spacy.io/api/morphologizer#init """ diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 377b2456f..c4a774cd0 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -601,9 +601,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides ## Pretrain {#pretrain new="2.1" tag="experimental"} - - -Pre-train the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline +Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline components on [raw text](/api/data-formats#pretrain), using an approximate language-modeling objective. Specifically, we load pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which match the @@ -611,7 +609,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can then include a **path to one of these pretrained weights files** in your [training config](/usage/training#config) as the `init_tok2vec` setting when you train your model. This technique may be especially helpful if you have little -labelled data. +labelled data. See the usage docs on [pretraining](/usage/training#pretraining) +for more info. @@ -634,8 +633,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path] | `output_dir` | positional | Directory to write models to on each epoch. | | `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | | `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. | -| `--resume-path`, `-r` | option | TODO: | -| `--epoch-resume`, `-er` | option | TODO: | +| `--resume-path`, `-r` | option | Path to pretrained weights from which to resume pretraining. | +| `--epoch-resume`, `-er` | option | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. | | `--help`, `-h` | flag | Show help message and available arguments. | | overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. | | **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. 
|
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index c0a87756d..af7cb26de 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -20,9 +20,9 @@ Config files define the training process and model pipeline and can be passed
 to [`spacy train`](/api/cli#train). They use
 [Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
 hood. For details on how to use training configs, see the
-[usage documentation](/usage/training#config).
-
-
+[usage documentation](/usage/training#config). To get started with a blank
+config or fill a partial config with all defaults, you can use the
+[`init config`](/api/cli#init-config) command.
 
 > #### What does the @ mean?
 >
@@ -52,8 +52,6 @@ your config and check that it's valid, you can run the
 
 
-
-
 ### nlp {#config-nlp tag="section"}
 
 > #### Example
@@ -154,8 +152,6 @@ This section is optional and defines settings and controls for
 [language model pretraining](/usage/training#pretraining). It's used when you
 run [`spacy pretrain`](/api/cli#pretrain).
 
-
-
 | Name | Type | Description | Default |
 | ---------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------- | --------------------------------------------------- |
 | `max_epochs` | int | Maximum number of epochs. | `1000` |
diff --git a/website/docs/api/dependencymatcher.md b/website/docs/api/dependencymatcher.md
index 3638575df..4f192783f 100644
--- a/website/docs/api/dependencymatcher.md
+++ b/website/docs/api/dependencymatcher.md
@@ -5,4 +5,194 @@ tag: class
 source: spacy/matcher/dependencymatcher.pyx
 ---
 
-TODO: write
+The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher)
+and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees
+using the
+[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html).
+It requires a pretrained [`DependencyParser`](/api/parser) or other component
+that sets the `Token.dep` attribute.
+
+## Pattern format {#patterns}
+
+> ```json
+> ### Example
+> [
+>   {
+>     "SPEC": {"NODE_NAME": "founded"},
+>     "PATTERN": {"ORTH": "founded"}
+>   },
+>   {
+>     "SPEC": {
+>       "NODE_NAME": "founder",
+>       "NBOR_RELOP": ">",
+>       "NBOR_NAME": "founded"
+>     },
+>     "PATTERN": {"DEP": "nsubj"}
+>   },
+>   {
+>     "SPEC": {
+>       "NODE_NAME": "object",
+>       "NBOR_RELOP": ">",
+>       "NBOR_NAME": "founded"
+>     },
+>     "PATTERN": {"DEP": "dobj"}
+>   }
+> ]
+> ```
+
+A pattern added to the `DependencyMatcher` consists of a list of dictionaries,
+with each dictionary describing a node to match. Each dictionary should have
+the following top-level keys:
+
+| Name | Type | Description |
+| --------- | ---- | --------------------------------------------------------------------------------------------------------------------------- |
+| `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). |
+| `SPEC` | dict | The relationships of the nodes in the subtree that should be matched. |
+
+The `SPEC` includes the following fields:
+
+| Name | Type | Description |
+| ------------ | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `NODE_NAME` | str | A unique name for this node to refer to it in other specs. |
+| `NBOR_RELOP` | str | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. |
+| `NBOR_NAME` | str | The unique name of the node that this node is connected to. |
+
+## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
+
+Create a rule-based `DependencyMatcher`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+> matcher = DependencyMatcher(nlp.vocab)
+> ```
+
+| Name | Type | Description |
+| ------- | ------- | ------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
+
+## DependencyMatcher.\_\_call\_\_ {#call tag="method"}
+
+Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> pattern = [
+>     {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+>     {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Founder", [pattern])
+> doc = nlp("Bill Gates founded Microsoft.")
+> matches = matcher(doc)
+> ```
+
+| Name | Type | Description |
+| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. |
+| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
+
+## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
+
+Get the number of rules (edges) added to the dependency matcher. Note that this
+only returns the number of rules (identical with the number of IDs), not the
+number of individual patterns.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert len(matcher) == 0
+> pattern = [
+>     {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+>     {"SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Rule", [pattern])
+> assert len(matcher) == 1
+> ```
+
+| Name | Type | Description |
+| ----------- | ---- | -------------------- |
+| **RETURNS** | int | The number of rules. |
+
+## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"}
+
+Check whether the matcher contains rules for a match ID.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert "Rule" not in matcher
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> ```
+
+| Name | Type | Description |
+| ----------- | ---- | ----------------------------------------------------- |
+| `key` | str | The match ID. |
+| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
+
+## DependencyMatcher.add {#add tag="method"}
+
+Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.
+
+> #### Example
+>
+> ```python
+> def on_match(matcher, doc, id, matches):
+>     print('Matched!', matches)
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> matcher.add("TEST_PATTERNS", patterns, on_match=on_match)
+> ```
+
+| Name | Type | Description |
+| -------------- | ------------------ | ------------------------------------------------------------------------------------------------ |
+| `match_id` | str | An ID for the thing you're matching. |
+| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a node to match. |
+| _keyword-only_ | | |
+| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
+## DependencyMatcher.remove {#remove tag="method"}
+
+Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
+exist.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> matcher.remove("Rule")
+> assert "Rule" not in matcher
+> ```
+
+| Name | Type | Description |
+| ----- | ---- | ------------------------- |
+| `key` | str | The ID of the match rule. |
+
+## DependencyMatcher.get {#get tag="method"}
+
+Retrieve the pattern stored for a key. Returns the rule as an
+`(on_match, patterns)` tuple containing the callback and available patterns.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern], on_match=on_match)
+> on_match, patterns = matcher.get("Rule")
+> ```
+
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------------- |
+| `key` | str | The ID of the match rule. |
+| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index 925c9ad2e..b481f1972 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -5,6 +5,82 @@ tag: class
 source: spacy/matcher/matcher.pyx
 ---
 
+The `Matcher` lets you find words and phrases using rules describing their token
+attributes. Rules can refer to token annotations (like the text or
+part-of-speech tags), as well as lexical attributes like `Token.is_punct`.
+Applying the matcher to a [`Doc`](/api/doc) gives you access to the matched
+tokens in context. For in-depth examples and workflows for combining rules and
+statistical models, see the [usage guide](/usage/rule-based-matching) on
+rule-based matching.
+
+## Pattern format {#patterns}
+
+> ```json
+> ### Example
+> [
+>   {"LOWER": "i"},
+>   {"LEMMA": {"IN": ["like", "love"]}},
+>   {"POS": "NOUN", "OP": "+"}
+> ]
+> ```
+
+A pattern added to the `Matcher` consists of a list of dictionaries. Each
+dictionary describes **one token** and its attributes. The available token
+pattern keys correspond to a number of
+[`Token` attributes](/api/token#attributes). The supported attributes for
+rule-based matching are:
+
+| Attribute | Type | Description |
+| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ |
+| `ORTH` | str | The exact verbatim text of a token. |
+| `TEXT` 2.1 | str | The exact verbatim text of a token. |
+| `LOWER` | str | The lowercase form of the token text. |
+| `LENGTH` | int | The length of the token text. |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
+| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
+| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
+| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
+| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
+| `ENT_TYPE` | str | The token's entity label. |
+| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
+| `OP` | str | Operator or quantifier to determine how often to match a token pattern. |
+
+Operators and quantifiers define **how often** a token pattern should be
+matched:
+
+> ```json
+> ### Example
+> [
+>   {"POS": "ADJ", "OP": "*"},
+>   {"POS": "NOUN", "OP": "+"}
+> ]
+> ```
+
+| OP | Description |
+| --- | ---------------------------------------------------------------- |
+| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
+| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
+| `+` | Require the pattern to match 1 or more times. |
+| `*` | Allow the pattern to match zero or more times. |
+
+Token patterns can also map to a **dictionary of properties** instead of a
+single value to indicate whether the expected value is a member of a list or how
+it compares to another value.
+
+> ```json
+> ### Example
+> [
+>   {"LEMMA": {"IN": ["like", "love", "enjoy"]}},
+>   {"POS": "PROPN", "LENGTH": {">=": 10}}
+> ]
+> ```
+
+| Attribute | Type | Description |
+| -------------------------- | ---------- | ---------------------------------------------------------------------------------- |
+| `IN` | any | Attribute value is a member of a list. |
+| `NOT_IN` | any | Attribute value is _not_ a member of a list. |
+| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. |
+
 ## Matcher.\_\_init\_\_ {#init tag="method"}
 
 Create the rule-based `Matcher`. If `validate=True` is set, all patterns added
@@ -60,7 +136,7 @@ Match a stream of documents, yielding them in turn.
 
 | Name | Type | Description |
 | --------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `docs` | iterable | A stream of documents. |
+| `docs` | iterable | A stream of documents or spans. |
 | `batch_size` | int | The number of documents to accumulate into a working set. |
 | `return_matches` 2.1 | bool | Yield the match lists along with the docs, making results `(doc, matches)` tuples. |
 | `as_tuples` | bool | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. |
@@ -105,11 +181,11 @@ Check whether the matcher contains rules for a match ID.
 
 ## Matcher.add {#add tag="method" new="2"}
 
-Add a rule to the matcher, consisting of an ID key, one or more patterns, and a
-callback function to act on the matches. The callback function will receive the
-arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for
-the given ID, the patterns will be extended. An `on_match` callback will be
-overwritten.
+Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
+optional callback function to act on the matches. 
The callback function will +receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already +exists for the given ID, the patterns will be extended. An `on_match` callback +will be overwritten. > #### Example > @@ -141,12 +217,13 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]] -| Name | Type | Description | -| -------------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | str | An ID for the thing you're matching. | -| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | -| _keyword-only_ | | | -| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | +| Name | Type | Description | +| ----------------------------------- | ------------------ | --------------------------------------------------------------------------------------------- | +| `match_id` | str | An ID for the thing you're matching. | +| `patterns` | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | +| _keyword-only_ | | | +| `on_match` | callable / `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | +| `greedy` 3 | str | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. | ## Matcher.remove {#remove tag="method" new="2"} diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index bfe5c3c77..942440234 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -63,16 +63,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). - - | Name | Type | Description | | -------------- | ------- | ------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | The shared vocabulary. | | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | _keyword-only_ | | | -| `labels_morph` | dict | | -| `labels_pos` | dict | | +| `labels_morph` | dict | Mapping of morph + POS tags to morph labels. | +| `labels_pos` | dict | Mapping of morph + POS tags to POS tags. | ## Morphologizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index 866aca096..71c7a463b 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -9,7 +9,8 @@ new: 2 The `PhraseMatcher` lets you efficiently match large terminology lists. While the [`Matcher`](/api/matcher) lets you match sequences based on lists of token descriptions, the `PhraseMatcher` accepts match patterns in the form of `Doc` -objects. +objects. See the [usage guide](/usage/rule-based-matching#phrasematcher) for +examples. ## PhraseMatcher.\_\_init\_\_ {#init tag="method"}
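Below is a minimal, self-contained sketch of the `Doc`-based matching that the
`PhraseMatcher` section above describes, using the v3-style `add(key, patterns)`
signature this patch documents for `Matcher.add`. The `"PRODUCTS"` key, the
example phrases and the blank English pipeline are illustrative only:

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
# Patterns are Doc objects; nlp.make_doc creates them with the tokenizer
# alone, which keeps adding large terminology lists fast
patterns = [nlp.make_doc("Google Now"), nlp.make_doc("GoogleNow")]
matcher.add("PRODUCTS", patterns)

doc = nlp("I like Google Now better than GoogleNow.")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
```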