diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py index 4ef7ae49a..44392edbe 100644 --- a/spacy/matcher/_schemas.py +++ b/spacy/matcher/_schemas.py @@ -174,6 +174,10 @@ TOKEN_PATTERN_SCHEMA = { "title": "Token is the first in a sentence", "$ref": "#/definitions/boolean_value", }, + "SPACY": { + "title": "Token has a trailing space", + "$ref": "#/definitions/boolean_value", + }, "LIKE_NUM": { "title": "Token resembles a number", "$ref": "#/definitions/boolean_value", diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 1873d3d2b..236f25130 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -440,6 +440,7 @@ def test_attr_pipeline_checks(en_vocab): ([{"IS_LEFT_PUNCT": True}], "``"), ([{"IS_RIGHT_PUNCT": True}], "''"), ([{"IS_STOP": True}], "the"), + ([{"SPACY": True}], "the"), ([{"LIKE_NUM": True}], "1"), ([{"LIKE_URL": True}], "http://example.com"), ([{"LIKE_EMAIL": True}], "mail@example.com"), diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 7ad402df1..e7729b383 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -157,20 +157,21 @@ The available token pattern keys correspond to a number of [`Token` attributes](/api/token#attributes). The supported attributes for rule-based matching are: -| Attribute | Value Type |  Description | -| -------------------------------------- | ------------- | ------------------------------------------------------------------------------------------------------ | -| `ORTH` | unicode | The exact verbatim text of a token. | -| `TEXT` 2.1 | unicode | The exact verbatim text of a token. | -| `LOWER` | unicode | The lowercase form of the token text. | -| `LENGTH` | int | The length of the token text. | -| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. 
| -| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | -| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. | -| `IS_SENT_START` | bool | Token is start of sentence. | -| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | -| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | unicode | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. Note that the values of these attributes are case-sensitive. For a list of available part-of-speech tags and dependency labels, see the [Annotation Specifications](/api/annotation).| -| `ENT_TYPE` | unicode | The token's entity label. | -| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | +| Attribute | Value Type |  Description | +| ------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `ORTH` | unicode | The exact verbatim text of a token. | +| `TEXT` 2.1 | unicode | The exact verbatim text of a token. | +| `LOWER` | unicode | The lowercase form of the token text. | +| `LENGTH` | int | The length of the token text. | +| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | +| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | +| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. | +| `IS_SENT_START` | bool | Token is start of sentence. | +| `SPACY` | bool | Token has a trailing space. | +| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. 
| +| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | unicode | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. Note that the values of these attributes are case-sensitive. For a list of available part-of-speech tags and dependency labels, see the [Annotation Specifications](/api/annotation). | +| `ENT_TYPE` | unicode | The token's entity label. | +| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | @@ -1102,21 +1103,28 @@ powerful model packages with binary weights _and_ rules included! ### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"} -When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**, -the EntityRuler calls the nlp object to construct a doc object. This happens in case you try -to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to -extract matches based on the pattern's POS signature. +When using a large amount of **phrase patterns** (roughly > 10000) it's useful +to understand how the `add_patterns` function of the EntityRuler works. For each +**phrase pattern**, the EntityRuler calls the nlp object to construct a doc +object. This happens in case you try to add the EntityRuler at the end of an +existing pipeline with, for example, a POS tagger and want to extract matches +based on the pattern's POS signature. -In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler. +In this case you would pass a config value of `phrase_matcher_attr="POS"` for +the EntityRuler. -Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns. 
+Running the full language pipeline across every pattern in a large list scales +linearly and can therefore take a long time for a large number of phrase patterns. -As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively. +As of spaCy 2.2.4 the `add_patterns` function has been refactored to use +`nlp.pipe` on all phrase patterns, resulting in about a 10x-20x speedup with +5,000-100,000 phrase patterns respectively. -Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time. +Even with this speedup (but especially if you're using an older version) the +`add_patterns` function can still take a long time. -An easy workaround to make this function run faster is disabling the other language pipes -while adding the phrase patterns. +An easy workaround to make this function run faster is disabling the other +language pipes while adding the phrase patterns. ```python entityruler = EntityRuler(nlp)