Expose tokenizer rules as a property
Expose the tokenizer rules property in the same way as the other core properties. (The cache resetting is overkill, but consistent with `from_bytes` for now.) Add tests and update Tokenizer API docs.
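In practice the new property reads and writes the special-case rules directly. A minimal usage sketch mirroring the tests below; it assumes the default English tokenizer, whose base exceptions include the `:)` emoticon:

```python
from spacy.lang.en import English

tokenizer = English().tokenizer

# Read the special-case rules, drop the ":)" exception, and assign the
# result back; the setter flushes the cache and reloads the special cases.
rules = tokenizer.rules
del rules[":)"]
tokenizer.rules = rules

# Without the rule, ":)" falls back to plain punctuation splitting.
assert [t.text for t in tokenizer("Hi :)")] == ["Hi", ":", ")"]
```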
This commit is contained in:
parent 3ec231f7e1
commit 983c88d02e
Tests:

```diff
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 import pytest
+import re
 from spacy.lang.en import English
 from spacy.tokenizer import Tokenizer
 from spacy.util import compile_prefix_regex, compile_suffix_regex
```
```diff
@@ -19,13 +20,14 @@ def custom_en_tokenizer(en_vocab):
         r"[\[\]!&:,()\*—–\/-]",
     ]
     infix_re = compile_infix_regex(custom_infixes)
+    token_match_re = re.compile("a-b")
     return Tokenizer(
         en_vocab,
         English.Defaults.tokenizer_exceptions,
         prefix_re.search,
         suffix_re.search,
         infix_re.finditer,
-        token_match=None,
+        token_match=token_match_re.match,
     )
```
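The fixture's new `token_match` shows the precedence the tests below rely on: a chunk that `token_match` accepts is kept whole before any infix splitting is attempted. A self-contained sketch of the same effect, with a hypothetical pattern and strings (no prefix or suffix regexes supplied):

```python
import re

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

nlp = English()
infix_re = compile_infix_regex([r"-"])  # split on hyphens...
token_match_re = re.compile(r"^a-b$")   # ...except in an exact "a-b"

tokenizer = Tokenizer(
    nlp.vocab,
    rules={},
    infix_finditer=infix_re.finditer,
    token_match=token_match_re.match,
)

# "a-b" survives intact; "c-d" is split by the infix rule.
assert [t.text for t in tokenizer("a-b c-d")] == ["a-b", "c", "-", "d"]
```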
```diff
@@ -74,3 +76,81 @@ def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
         "Megaregion",
         ".",
     ]
+
+
+def test_en_customized_tokenizer_handles_token_match(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion."
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "a-b",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+        ":)",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules_property(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
+    rules = custom_en_tokenizer.rules
+    del rules[":)"]
+    custom_en_tokenizer.rules = rules
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+        ":",
+        ")",
+    ]
```
Tokenizer implementation (Cython):

```diff
@@ -93,6 +93,19 @@ cdef class Tokenizer:
             self._infix_finditer = infix_finditer
             self._flush_cache()

+    property rules:
+        def __get__(self):
+            return self._rules
+
+        def __set__(self, rules):
+            self._rules = {}
+            self._reset_cache([key for key in self._cache])
+            self._reset_specials()
+            self._cache = PreshMap()
+            self._specials = PreshMap()
+            if rules is not None:
+                self._load_special_tokenization(rules)
+
    def __reduce__(self):
        args = (self.vocab,
                self._rules,
```
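Note that `__get__` returns the underlying dict, so mutating it in place changes `_rules` but not the precompiled special cases; changes only take effect through the setter. A sketch of the distinction, assuming the default English exceptions (which include the `:)` emoticon):

```python
from spacy.lang.en import English

tokenizer = English().tokenizer

# In-place mutation alone leaves the cached special cases intact...
del tokenizer.rules[":)"]
assert [t.text for t in tokenizer("ok :)")] == ["ok", ":)"]

# ...while assigning through the setter flushes and reloads them.
tokenizer.rules = tokenizer.rules
assert [t.text for t in tokenizer("ok :)")] == ["ok", ":", ")"]
```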
Tokenizer API docs:

```diff
@@ -34,15 +34,15 @@ the
 > tokenizer = nlp.Defaults.create_tokenizer(nlp)
 > ```

-| Name             | Type        | Description                                                                          |
-| ---------------- | ----------- | ------------------------------------------------------------------------------------ |
-| `vocab`          | `Vocab`     | A storage container for lexical types.                                               |
-| `rules`          | dict        | Exceptions and special-cases for the tokenizer.                                      |
-| `prefix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match prefixes. |
-| `suffix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match suffixes. |
-| `infix_finditer` | callable    | A function matching the signature of `re.compile(string).finditer` to find infixes. |
-| `token_match`    | callable    | A boolean function matching strings to be recognized as tokens.                     |
-| **RETURNS**      | `Tokenizer` | The newly constructed object.                                                        |
+| Name             | Type        | Description                                                                             |
+| ---------------- | ----------- | --------------------------------------------------------------------------------------- |
+| `vocab`          | `Vocab`     | A storage container for lexical types.                                                  |
+| `rules`          | dict        | Exceptions and special-cases for the tokenizer.                                         |
+| `prefix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match prefixes.    |
+| `suffix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match suffixes.    |
+| `infix_finditer` | callable    | A function matching the signature of `re.compile(string).finditer` to find infixes.    |
+| `token_match`    | callable    | A function matching the signature of `re.compile(string).match` to find token matches. |
+| **RETURNS**      | `Tokenizer` | The newly constructed object.                                                           |

 ## Tokenizer.\_\_call\_\_ {#call tag="method"}
```
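To make the documented signature concrete, a construction sketch; the patterns and the `don't` special case are illustrative examples, not spaCy's bundled defaults:

```python
import re

from spacy.attrs import ORTH
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab

# Illustrative patterns only; real pipelines use the language defaults.
prefix_re = re.compile(r'''^[\["']''')
suffix_re = re.compile(r'''[\]"']$''')
infix_re = re.compile(r"[-~]")
url_re = re.compile(r"^https?://\S+$")

tokenizer = Tokenizer(
    Vocab(),
    rules={"don't": [{ORTH: "do"}, {ORTH: "n't"}]},
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
    token_match=url_re.match,
)

doc = tokenizer("don't re-enter https://spacy.io")
assert [t.text for t in doc] == ["do", "n't", "re", "-", "enter", "https://spacy.io"]
```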
```diff
@@ -198,12 +198,14 @@ it.

 ## Attributes {#attributes}

-| Name             | Type    | Description                                                                                                                |
-| ---------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`          | `Vocab` | The vocab object of the parent `Doc`.                                                                                      |
-| `prefix_search`  | -       | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`.            |
-| `suffix_search`  | -       | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`.              |
-| `infix_finditer` | -       | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
+| Name             | Type    | Description                                                                                                                    |
+| ---------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| `vocab`          | `Vocab` | The vocab object of the parent `Doc`.                                                                                          |
+| `prefix_search`  | -       | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`.                |
+| `suffix_search`  | -       | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`.                  |
+| `infix_finditer` | -       | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects.     |
+| `token_match`    | -       | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`.  |
+| `rules`          | dict    | A dictionary of tokenizer exceptions and special cases.                                                                        |

 ## Serialization fields {#serialization-fields}
```
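A quick look at the two newly documented attributes; this assumes the default English pipeline, whose exceptions include the usual contractions:

```python
from spacy.lang.en import English

tokenizer = English().tokenizer

# `rules` is a plain dict mapping exception strings to token substrings.
assert isinstance(tokenizer.rules, dict)
assert "don't" in tokenizer.rules

# `token_match` is whatever `.match`-style callable was supplied (or None).
print(tokenizer.token_match)
```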