Expose tokenizer rules as a property
Expose the tokenizer rules property in the same way as the other core properties. (The cache resetting is overkill, but consistent with `from_bytes` for now.) Add tests and update Tokenizer API docs.
This commit is contained in:
parent
3ec231f7e1
commit
983c88d02e
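For context, a minimal sketch of how the new `rules` property is meant to be used, mirroring the `test_en_customized_tokenizer_handles_rules_property` test added below (the `English()` pipeline here is illustrative, not part of the commit):

```python
from spacy.lang.en import English

nlp = English()
tokenizer = nlp.tokenizer

# Read the special-case rules, drop the ":)" emoticon rule, and assign
# the dict back. The setter flushes the cache and reloads the rules.
rules = tokenizer.rules
del rules[":)"]
tokenizer.rules = rules

# Without the special case, ":)" falls through to the punctuation rules
# and is expected to split into ":" and ")".
print([t.text for t in tokenizer(":)")])
```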
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 import pytest
+import re
 from spacy.lang.en import English
 from spacy.tokenizer import Tokenizer
 from spacy.util import compile_prefix_regex, compile_suffix_regex
@@ -19,13 +20,14 @@ def custom_en_tokenizer(en_vocab):
         r"[\[\]!&:,()\*—–\/-]",
     ]
     infix_re = compile_infix_regex(custom_infixes)
+    token_match_re = re.compile("a-b")
     return Tokenizer(
         en_vocab,
         English.Defaults.tokenizer_exceptions,
         prefix_re.search,
         suffix_re.search,
         infix_re.finditer,
-        token_match=None,
+        token_match=token_match_re.match,
     )
@@ -74,3 +76,81 @@ def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
         "Megaregion",
         ".",
     ]
+
+
+def test_en_customized_tokenizer_handles_token_match(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion."
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "a-b",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+        ":)",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules_property(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
+    rules = custom_en_tokenizer.rules
+    del rules[":)"]
+    custom_en_tokenizer.rules = rules
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+        ":",
+        ")",
+    ]
@@ -93,6 +93,19 @@ cdef class Tokenizer:
             self._infix_finditer = infix_finditer
             self._flush_cache()

+    property rules:
+        def __get__(self):
+            return self._rules
+
+        def __set__(self, rules):
+            self._rules = {}
+            self._reset_cache([key for key in self._cache])
+            self._reset_specials()
+            self._cache = PreshMap()
+            self._specials = PreshMap()
+            if rules is not None:
+                self._load_special_tokenization(rules)
+
     def __reduce__(self):
         args = (self.vocab,
                 self._rules,
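Because the setter rebuilds the special cases from scratch, assigning to `rules` can also add new exceptions in bulk, not just remove them. A sketch under assumptions (the `:pybot:` token and the rule shape follow the existing exception format; `add_special_case` remains the one-off API):

```python
from spacy.attrs import ORTH
from spacy.lang.en import English

nlp = English()
rules = nlp.tokenizer.rules

# Each rule maps a string to a list of token-attribute dicts; here a
# single token whose ORTH is the whole string, so it is never split.
rules[":pybot:"] = [{ORTH: ":pybot:"}]

# Reassigning resets the cache and reloads every special case.
nlp.tokenizer.rules = rules
print([t.text for t in nlp.tokenizer("hello :pybot:")])
```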
@@ -34,15 +34,15 @@ the
 > tokenizer = nlp.Defaults.create_tokenizer(nlp)
 > ```

 | Name             | Type        | Description                                                                             |
-| ---------------- | ----------- | ----------------------------------------------------------------------------------- |
+| ---------------- | ----------- | --------------------------------------------------------------------------------------- |
 | `vocab`          | `Vocab`     | A storage container for lexical types.                                                  |
 | `rules`          | dict        | Exceptions and special-cases for the tokenizer.                                         |
 | `prefix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match prefixes.    |
 | `suffix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match suffixes.    |
 | `infix_finditer` | callable    | A function matching the signature of `re.compile(string).finditer` to find infixes.    |
-| `token_match`    | callable    | A boolean function matching strings to be recognized as tokens.                        |
+| `token_match`    | callable    | A function matching the signature of `re.compile(string).match` to find token matches. |
 | **RETURNS**      | `Tokenizer` | The newly constructed object.                                                           |

 ## Tokenizer.\_\_call\_\_ {#call tag="method"}
@@ -198,12 +198,14 @@ it.

 ## Attributes {#attributes}

 | Name             | Type    | Description                                                                                                                    |
-| ---------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------- |
+| ---------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------ |
 | `vocab`          | `Vocab` | The vocab object of the parent `Doc`.                                                                                          |
 | `prefix_search`  | -       | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`.               |
 | `suffix_search`  | -       | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`.                 |
 | `infix_finditer` | -       | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects.    |
+| `token_match`    | -       | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
+| `rules`          | dict    | A dictionary of tokenizer exceptions and special cases.                                                                        |

 ## Serialization fields {#serialization-fields}