mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-28 10:14:07 +03:00
Merge remote-tracking branch 'upstream/v4' into store-activations
This commit is contained in:
commit
3937abd2e7
4
.github/azure-steps.yml
vendored
4
.github/azure-steps.yml
vendored
|
@ -54,12 +54,12 @@ steps:
|
||||||
condition: eq(${{ parameters.gpu }}, true)
|
condition: eq(${{ parameters.gpu }}, true)
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
${{ parameters.prefix }} python -m pytest --pyargs spacy
|
${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
|
||||||
displayName: "Run CPU tests"
|
displayName: "Run CPU tests"
|
||||||
condition: eq(${{ parameters.gpu }}, false)
|
condition: eq(${{ parameters.gpu }}, false)
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
|
${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
|
||||||
displayName: "Run GPU tests"
|
displayName: "Run GPU tests"
|
||||||
condition: eq(${{ parameters.gpu }}, true)
|
condition: eq(${{ parameters.gpu }}, true)
|
||||||
|
|
||||||
|
|
13
.github/no-response.yml
vendored
13
.github/no-response.yml
vendored
|
@ -1,13 +0,0 @@
|
||||||
# Configuration for probot-no-response - https://github.com/probot/no-response
|
|
||||||
|
|
||||||
# Number of days of inactivity before an Issue is closed for lack of response
|
|
||||||
daysUntilClose: 14
|
|
||||||
# Label requiring a response
|
|
||||||
responseRequiredLabel: more-info-needed
|
|
||||||
# Comment to post when closing an Issue for lack of response. Set to `false` to disable
|
|
||||||
closeComment: >
|
|
||||||
This issue has been automatically closed because there has been no response
|
|
||||||
to a request for more information from the original author. With only the
|
|
||||||
information that is currently in the issue, there's not enough information
|
|
||||||
to take action. If you're the original author, feel free to reopen the issue
|
|
||||||
if you have or find the answers needed to investigate further.
|
|
8
.github/workflows/issue-manager.yml
vendored
8
.github/workflows/issue-manager.yml
vendored
|
@ -15,7 +15,7 @@ jobs:
|
||||||
issue-manager:
|
issue-manager:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: tiangolo/issue-manager@0.2.1
|
- uses: tiangolo/issue-manager@0.4.0
|
||||||
with:
|
with:
|
||||||
token: ${{ secrets.GITHUB_TOKEN }}
|
token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
config: >
|
config: >
|
||||||
|
@ -25,5 +25,11 @@ jobs:
|
||||||
"message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.",
|
"message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.",
|
||||||
"remove_label_on_comment": true,
|
"remove_label_on_comment": true,
|
||||||
"remove_label_on_close": true
|
"remove_label_on_close": true
|
||||||
|
},
|
||||||
|
"more-info-needed": {
|
||||||
|
"delay": "P7D",
|
||||||
|
"message": "This issue has been automatically closed because there has been no response to a request for more information from the original author. With only the information that is currently in the issue, there's not enough information to take action. If you're the original author, feel free to reopen the issue if you have or find the answers needed to investigate further.",
|
||||||
|
"remove_label_on_comment": true,
|
||||||
|
"remove_label_on_close": true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -32,7 +32,7 @@ jobs:
|
||||||
versionSpec: "3.7"
|
versionSpec: "3.7"
|
||||||
- script: |
|
- script: |
|
||||||
pip install flake8==3.9.2
|
pip install flake8==3.9.2
|
||||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
|
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||||
displayName: "flake8"
|
displayName: "flake8"
|
||||||
|
|
||||||
- job: "Test"
|
- job: "Test"
|
||||||
|
|
|
@ -191,6 +191,8 @@ def load_model(name: str) -> "Language":
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Note that we typically put the `from typing` import statements on the first line(s) of the Python module.
|
||||||
|
|
||||||
## Structuring logic
|
## Structuring logic
|
||||||
|
|
||||||
### Positional and keyword arguments
|
### Positional and keyword arguments
|
||||||
|
@ -275,6 +277,27 @@ If you have to use `try`/`except`, make sure to only include what's **absolutely
|
||||||
+ return [v.strip() for v in value.split(",")]
|
+ return [v.strip() for v in value.split(",")]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Numeric comparisons
|
||||||
|
|
||||||
|
For numeric comparisons, as a general rule we always use `<` and `>=` and avoid the usage of `<=` and `>`. This is to ensure we consistently
|
||||||
|
apply inclusive lower bounds and exclusive upper bounds, helping to prevent off-by-one errors.
|
||||||
|
|
||||||
|
One exception to this rule is the ternary case, i.e. a chained comparison with three operands. With a chain like
|
||||||
|
|
||||||
|
```python
|
||||||
|
if value >= 0 and value < max:
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
it's fine to rewrite this to the shorter form
|
||||||
|
|
||||||
|
```python
|
||||||
|
if 0 <= value < max:
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
even though this requires the usage of the `<=` operator.
|
||||||
|
|
||||||
### Iteration and comprehensions
|
### Iteration and comprehensions
|
||||||
|
|
||||||
We generally avoid using built-in functions like `filter` or `map` in favor of list or generator comprehensions.
|
We generally avoid using built-in functions like `filter` or `map` in favor of list or generator comprehensions.
|
||||||
|
|
10
setup.cfg
10
setup.cfg
|
@ -31,14 +31,6 @@ project_urls =
|
||||||
zip_safe = false
|
zip_safe = false
|
||||||
include_package_data = true
|
include_package_data = true
|
||||||
python_requires = >=3.6
|
python_requires = >=3.6
|
||||||
setup_requires =
|
|
||||||
cython>=0.25,<3.0
|
|
||||||
numpy>=1.15.0
|
|
||||||
# We also need our Cython packages here to compile against
|
|
||||||
cymem>=2.0.2,<2.1.0
|
|
||||||
preshed>=3.0.2,<3.1.0
|
|
||||||
murmurhash>=0.28.0,<1.1.0
|
|
||||||
thinc>=8.1.0,<8.2.0
|
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
spacy-legacy>=3.0.9,<3.1.0
|
spacy-legacy>=3.0.9,<3.1.0
|
||||||
|
@ -114,7 +106,7 @@ ja =
|
||||||
sudachipy>=0.5.2,!=0.6.1
|
sudachipy>=0.5.2,!=0.6.1
|
||||||
sudachidict_core>=20211220
|
sudachidict_core>=20211220
|
||||||
ko =
|
ko =
|
||||||
natto-py>=0.9.0
|
mecab-ko>=1.0.0
|
||||||
th =
|
th =
|
||||||
pythainlp>=2.0
|
pythainlp>=2.0
|
||||||
|
|
||||||
|
|
|
@ -97,7 +97,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
|
||||||
locals().update(IDS)
|
locals().update(IDS)
|
||||||
|
|
||||||
|
|
||||||
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
def intify_attrs(stringy_attrs, strings_map=None):
|
||||||
"""
|
"""
|
||||||
Normalize a dictionary of attributes, converting them to ints.
|
Normalize a dictionary of attributes, converting them to ints.
|
||||||
|
|
||||||
|
@ -109,75 +109,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
converted to ints.
|
converted to ints.
|
||||||
"""
|
"""
|
||||||
inty_attrs = {}
|
inty_attrs = {}
|
||||||
if _do_deprecated:
|
|
||||||
if "F" in stringy_attrs:
|
|
||||||
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
|
|
||||||
if "L" in stringy_attrs:
|
|
||||||
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
|
|
||||||
if "pos" in stringy_attrs:
|
|
||||||
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
|
|
||||||
if "morph" in stringy_attrs:
|
|
||||||
morphs = stringy_attrs.pop("morph")
|
|
||||||
if "number" in stringy_attrs:
|
|
||||||
stringy_attrs.pop("number")
|
|
||||||
if "tenspect" in stringy_attrs:
|
|
||||||
stringy_attrs.pop("tenspect")
|
|
||||||
morph_keys = [
|
|
||||||
"PunctType",
|
|
||||||
"PunctSide",
|
|
||||||
"Other",
|
|
||||||
"Degree",
|
|
||||||
"AdvType",
|
|
||||||
"Number",
|
|
||||||
"VerbForm",
|
|
||||||
"PronType",
|
|
||||||
"Aspect",
|
|
||||||
"Tense",
|
|
||||||
"PartType",
|
|
||||||
"Poss",
|
|
||||||
"Hyph",
|
|
||||||
"ConjType",
|
|
||||||
"NumType",
|
|
||||||
"Foreign",
|
|
||||||
"VerbType",
|
|
||||||
"NounType",
|
|
||||||
"Gender",
|
|
||||||
"Mood",
|
|
||||||
"Negative",
|
|
||||||
"Tense",
|
|
||||||
"Voice",
|
|
||||||
"Abbr",
|
|
||||||
"Derivation",
|
|
||||||
"Echo",
|
|
||||||
"Foreign",
|
|
||||||
"NameType",
|
|
||||||
"NounType",
|
|
||||||
"NumForm",
|
|
||||||
"NumValue",
|
|
||||||
"PartType",
|
|
||||||
"Polite",
|
|
||||||
"StyleVariant",
|
|
||||||
"PronType",
|
|
||||||
"AdjType",
|
|
||||||
"Person",
|
|
||||||
"Variant",
|
|
||||||
"AdpType",
|
|
||||||
"Reflex",
|
|
||||||
"Negative",
|
|
||||||
"Mood",
|
|
||||||
"Aspect",
|
|
||||||
"Case",
|
|
||||||
"Polarity",
|
|
||||||
"PrepCase",
|
|
||||||
"Animacy", # U20
|
|
||||||
]
|
|
||||||
for key in morph_keys:
|
|
||||||
if key in stringy_attrs:
|
|
||||||
stringy_attrs.pop(key)
|
|
||||||
elif key.lower() in stringy_attrs:
|
|
||||||
stringy_attrs.pop(key.lower())
|
|
||||||
elif key.upper() in stringy_attrs:
|
|
||||||
stringy_attrs.pop(key.upper())
|
|
||||||
for name, value in stringy_attrs.items():
|
for name, value in stringy_attrs.items():
|
||||||
int_key = intify_attr(name)
|
int_key = intify_attr(name)
|
||||||
if int_key is not None:
|
if int_key is not None:
|
||||||
|
|
|
@ -7,6 +7,7 @@ import typer
|
||||||
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
|
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
|
||||||
from .. import about
|
from .. import about
|
||||||
from ..util import is_package, get_minor_version, run_command
|
from ..util import is_package, get_minor_version, run_command
|
||||||
|
from ..util import is_prerelease_version
|
||||||
from ..errors import OLD_MODEL_SHORTCUTS
|
from ..errors import OLD_MODEL_SHORTCUTS
|
||||||
|
|
||||||
|
|
||||||
|
@ -74,7 +75,10 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
|
||||||
|
|
||||||
|
|
||||||
def get_compatibility() -> dict:
|
def get_compatibility() -> dict:
|
||||||
version = get_minor_version(about.__version__)
|
if is_prerelease_version(about.__version__):
|
||||||
|
version: Optional[str] = about.__version__
|
||||||
|
else:
|
||||||
|
version = get_minor_version(about.__version__)
|
||||||
r = requests.get(about.__compatibility__)
|
r = requests.get(about.__compatibility__)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
|
|
|
@ -123,7 +123,8 @@ def app(environ, start_response):
|
||||||
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||||
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
||||||
|
|
||||||
doc (Doc): Document do parse.
|
orig_doc (Doc): Document to parse.
|
||||||
|
options (Dict[str, Any]): Dependency parse specific visualisation options.
|
||||||
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
||||||
"""
|
"""
|
||||||
doc = Doc(orig_doc.vocab).from_bytes(
|
doc = Doc(orig_doc.vocab).from_bytes(
|
||||||
|
@ -209,7 +210,7 @@ def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||||
|
|
||||||
|
|
||||||
def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||||
"""Generate spans in [{start: i, end: i, label: 'label'}] format.
|
"""Generate spans in [{start_token: i, end_token: i, label: 'label'}] format.
|
||||||
|
|
||||||
doc (Doc): Document to parse.
|
doc (Doc): Document to parse.
|
||||||
options (Dict[str, Any]): Span-specific visualisation options.
|
options (Dict[str, Any]): Span-specific visualisation options.
|
||||||
|
|
|
@ -16,8 +16,8 @@ def setup_default_warnings():
|
||||||
filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa
|
filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa
|
||||||
filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa
|
filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa
|
||||||
|
|
||||||
# warn about entity_ruler & matcher having no patterns only once
|
# warn about entity_ruler, span_ruler & matcher having no patterns only once
|
||||||
for pipe in ["matcher", "entity_ruler"]:
|
for pipe in ["matcher", "entity_ruler", "span_ruler"]:
|
||||||
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
|
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
|
||||||
|
|
||||||
# warn once about lemmatizer without required POS
|
# warn once about lemmatizer without required POS
|
||||||
|
@ -390,7 +390,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"consider using doc.spans instead.")
|
"consider using doc.spans instead.")
|
||||||
E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
|
E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
|
||||||
"settings: {opts}")
|
"settings: {opts}")
|
||||||
E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}")
|
E107 = ("Value of custom attribute `{attr}` is not JSON-serializable: {value}")
|
||||||
E109 = ("Component '{name}' could not be run. Did you forget to "
|
E109 = ("Component '{name}' could not be run. Did you forget to "
|
||||||
"call `initialize()`?")
|
"call `initialize()`?")
|
||||||
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
||||||
|
@ -536,11 +536,12 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
||||||
"table, which contains {n_rows} vectors.")
|
"table, which contains {n_rows} vectors.")
|
||||||
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
||||||
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
E200 = ("Can't set {attr} from Span.")
|
||||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
|
||||||
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
||||||
|
|
||||||
# New errors added in v3.x
|
# New errors added in v3.x
|
||||||
|
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
|
||||||
|
"not permitted in factory names.")
|
||||||
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
|
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
|
||||||
"permit overlapping spans.")
|
"permit overlapping spans.")
|
||||||
E855 = ("Invalid {obj}: {obj} is not from the same doc.")
|
E855 = ("Invalid {obj}: {obj} is not from the same doc.")
|
||||||
|
|
|
@ -18,34 +18,23 @@ DEFAULT_CONFIG = """
|
||||||
|
|
||||||
[nlp.tokenizer]
|
[nlp.tokenizer]
|
||||||
@tokenizers = "spacy.ko.KoreanTokenizer"
|
@tokenizers = "spacy.ko.KoreanTokenizer"
|
||||||
|
mecab_args = ""
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@registry.tokenizers("spacy.ko.KoreanTokenizer")
|
@registry.tokenizers("spacy.ko.KoreanTokenizer")
|
||||||
def create_tokenizer():
|
def create_tokenizer(mecab_args: str):
|
||||||
def korean_tokenizer_factory(nlp):
|
def korean_tokenizer_factory(nlp):
|
||||||
return KoreanTokenizer(nlp.vocab)
|
return KoreanTokenizer(nlp.vocab, mecab_args=mecab_args)
|
||||||
|
|
||||||
return korean_tokenizer_factory
|
return korean_tokenizer_factory
|
||||||
|
|
||||||
|
|
||||||
class KoreanTokenizer(DummyTokenizer):
|
class KoreanTokenizer(DummyTokenizer):
|
||||||
def __init__(self, vocab: Vocab):
|
def __init__(self, vocab: Vocab, *, mecab_args: str = ""):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self._mecab = try_mecab_import() # type: ignore[func-returns-value]
|
mecab = try_mecab_import()
|
||||||
self._mecab_tokenizer = None
|
self.mecab_tokenizer = mecab.Tagger(mecab_args)
|
||||||
|
|
||||||
@property
|
|
||||||
def mecab_tokenizer(self):
|
|
||||||
# This is a property so that initializing a pipeline with blank:ko is
|
|
||||||
# possible without actually requiring mecab-ko, e.g. to run
|
|
||||||
# `spacy init vectors ko` for a pipeline that will have a different
|
|
||||||
# tokenizer in the end. The languages need to match for the vectors
|
|
||||||
# to be imported and there's no way to pass a custom config to
|
|
||||||
# `init vectors`.
|
|
||||||
if self._mecab_tokenizer is None:
|
|
||||||
self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
|
|
||||||
return self._mecab_tokenizer
|
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return KoreanTokenizer, (self.vocab,)
|
return KoreanTokenizer, (self.vocab,)
|
||||||
|
@ -68,13 +57,15 @@ class KoreanTokenizer(DummyTokenizer):
|
||||||
def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
|
def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
|
||||||
# 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
|
# 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
|
||||||
# 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
|
# 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
|
||||||
for node in self.mecab_tokenizer.parse(text, as_nodes=True):
|
for line in self.mecab_tokenizer.parse(text).split("\n"):
|
||||||
if node.is_eos():
|
if line == "EOS":
|
||||||
break
|
break
|
||||||
surface = node.surface
|
surface, _, expr = line.partition("\t")
|
||||||
feature = node.feature
|
features = expr.split("/")[0].split(",")
|
||||||
tag, _, expr = feature.partition(",")
|
tag = features[0]
|
||||||
lemma, _, remainder = expr.partition("/")
|
lemma = "*"
|
||||||
|
if len(features) >= 8:
|
||||||
|
lemma = features[7]
|
||||||
if lemma == "*":
|
if lemma == "*":
|
||||||
lemma = surface
|
lemma = surface
|
||||||
yield {"surface": surface, "lemma": lemma, "tag": tag}
|
yield {"surface": surface, "lemma": lemma, "tag": tag}
|
||||||
|
@ -97,20 +88,94 @@ class Korean(Language):
|
||||||
Defaults = KoreanDefaults
|
Defaults = KoreanDefaults
|
||||||
|
|
||||||
|
|
||||||
def try_mecab_import() -> None:
|
def try_mecab_import():
|
||||||
try:
|
try:
|
||||||
from natto import MeCab
|
import mecab_ko as MeCab
|
||||||
|
|
||||||
return MeCab
|
return MeCab
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
|
'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
|
||||||
"[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
|
"the python package `mecab-ko`: pip install mecab-ko"
|
||||||
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
|
|
||||||
"and [natto-py](https://github.com/buruzaemon/natto-py)"
|
|
||||||
) from None
|
) from None
|
||||||
|
|
||||||
|
|
||||||
|
@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
|
||||||
|
def create_natto_tokenizer():
|
||||||
|
def korean_natto_tokenizer_factory(nlp):
|
||||||
|
return KoreanNattoTokenizer(nlp.vocab)
|
||||||
|
|
||||||
|
return korean_natto_tokenizer_factory
|
||||||
|
|
||||||
|
|
||||||
|
class KoreanNattoTokenizer(DummyTokenizer):
|
||||||
|
def __init__(self, vocab: Vocab):
|
||||||
|
self.vocab = vocab
|
||||||
|
self._mecab = self._try_mecab_import() # type: ignore[func-returns-value]
|
||||||
|
self._mecab_tokenizer = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def mecab_tokenizer(self):
|
||||||
|
# This is a property so that initializing a pipeline with blank:ko is
|
||||||
|
# possible without actually requiring mecab-ko, e.g. to run
|
||||||
|
# `spacy init vectors ko` for a pipeline that will have a different
|
||||||
|
# tokenizer in the end. The languages need to match for the vectors
|
||||||
|
# to be imported and there's no way to pass a custom config to
|
||||||
|
# `init vectors`.
|
||||||
|
if self._mecab_tokenizer is None:
|
||||||
|
self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
|
||||||
|
return self._mecab_tokenizer
|
||||||
|
|
||||||
|
def __reduce__(self):
|
||||||
|
return KoreanNattoTokenizer, (self.vocab,)
|
||||||
|
|
||||||
|
def __call__(self, text: str) -> Doc:
|
||||||
|
dtokens = list(self.detailed_tokens(text))
|
||||||
|
surfaces = [dt["surface"] for dt in dtokens]
|
||||||
|
doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
|
||||||
|
for token, dtoken in zip(doc, dtokens):
|
||||||
|
first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
|
||||||
|
token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미)
|
||||||
|
if token.tag_ in TAG_MAP:
|
||||||
|
token.pos = TAG_MAP[token.tag_][POS]
|
||||||
|
else:
|
||||||
|
token.pos = X
|
||||||
|
token.lemma_ = dtoken["lemma"]
|
||||||
|
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
|
||||||
|
return doc
|
||||||
|
|
||||||
|
def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
|
||||||
|
# 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
|
||||||
|
# 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
|
||||||
|
for node in self.mecab_tokenizer.parse(text, as_nodes=True):
|
||||||
|
if node.is_eos():
|
||||||
|
break
|
||||||
|
surface = node.surface
|
||||||
|
feature = node.feature
|
||||||
|
tag, _, expr = feature.partition(",")
|
||||||
|
lemma, _, remainder = expr.partition("/")
|
||||||
|
if lemma == "*" or lemma == "":
|
||||||
|
lemma = surface
|
||||||
|
yield {"surface": surface, "lemma": lemma, "tag": tag}
|
||||||
|
|
||||||
|
def score(self, examples):
|
||||||
|
validate_examples(examples, "KoreanTokenizer.score")
|
||||||
|
return Scorer.score_tokenization(examples)
|
||||||
|
|
||||||
|
def _try_mecab_import(self):
|
||||||
|
try:
|
||||||
|
from natto import MeCab
|
||||||
|
|
||||||
|
return MeCab
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
'The Korean Natto tokenizer ("spacy.ko.KoreanNattoTokenizer") requires '
|
||||||
|
"[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
|
||||||
|
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
|
||||||
|
"and [natto-py](https://github.com/buruzaemon/natto-py)"
|
||||||
|
) from None
|
||||||
|
|
||||||
|
|
||||||
def check_spaces(text, tokens):
|
def check_spaces(text, tokens):
|
||||||
prev_end = -1
|
prev_end = -1
|
||||||
start = 0
|
start = 0
|
||||||
|
|
|
@ -3,7 +3,7 @@ from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
|
||||||
_infixes = (
|
_infixes = (
|
||||||
["·", "ㆍ", "\(", "\)"]
|
["·", "ㆍ", r"\(", r"\)"]
|
||||||
+ [r"(?<=[0-9])~(?=[0-9-])"]
|
+ [r"(?<=[0-9])~(?=[0-9-])"]
|
||||||
+ LIST_QUOTES
|
+ LIST_QUOTES
|
||||||
+ BASE_TOKENIZER_INFIXES
|
+ BASE_TOKENIZER_INFIXES
|
||||||
|
|
18
spacy/lang/lg/__init__.py
Normal file
18
spacy/lang/lg/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
|
class LugandaDefaults(BaseDefaults):
|
||||||
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
infixes = TOKENIZER_INFIXES
|
||||||
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
class Luganda(Language):
|
||||||
|
lang = "lg"
|
||||||
|
Defaults = LugandaDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Luganda"]
|
17
spacy/lang/lg/examples.py
Normal file
17
spacy/lang/lg/examples.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.lg.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Mpa ebyafaayo ku byalo Nakatu ne Nkajja",
|
||||||
|
"Okuyita Ttembo kitegeeza kugwa ddalu",
|
||||||
|
"Ekifumu kino kyali kya mulimu ki?",
|
||||||
|
"Ekkovu we liyise wayitibwa mukululo",
|
||||||
|
"Akola mulimu ki oguvaamu ssente?",
|
||||||
|
"Emisumaali egikomerera embaawo giyitibwa nninga",
|
||||||
|
"Abooluganda ab’emmamba ababiri",
|
||||||
|
"Ekisaawe ky'ebyenjigiriza kya mugaso nnyo",
|
||||||
|
]
|
95
spacy/lang/lg/lex_attrs.py
Normal file
95
spacy/lang/lg/lex_attrs.py
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
_num_words = [
|
||||||
|
"nnooti", # Zero
|
||||||
|
"zeero", # zero
|
||||||
|
"emu", # one
|
||||||
|
"bbiri", # two
|
||||||
|
"ssatu", # three
|
||||||
|
"nnya", # four
|
||||||
|
"ttaano", # five
|
||||||
|
"mukaaga", # six
|
||||||
|
"musanvu", # seven
|
||||||
|
"munaana", # eight
|
||||||
|
"mwenda", # nine
|
||||||
|
"kkumi", # ten
|
||||||
|
"kkumi n'emu", # eleven
|
||||||
|
"kkumi na bbiri", # twelve
|
||||||
|
"kkumi na ssatu", # thirteen
|
||||||
|
"kkumi na nnya", # fourteen
|
||||||
|
"kkumi na ttaano", # fifteen
|
||||||
|
"kkumi na mukaaga", # sixteen
|
||||||
|
"kkumi na musanvu", # seventeen
|
||||||
|
"kkumi na munaana", # eighteen
|
||||||
|
"kkumi na mwenda", # nineteen
|
||||||
|
"amakumi abiri", # twenty
|
||||||
|
"amakumi asatu", # thirty
|
||||||
|
"amakumi ana", # forty
|
||||||
|
"amakumi ataano", # fifty
|
||||||
|
"nkaaga", # sixty
|
||||||
|
"nsanvu", # seventy
|
||||||
|
"kinaana", # eighty
|
||||||
|
"kyenda", # ninety
|
||||||
|
"kikumi", # hundred
|
||||||
|
"lukumi", # thousand
|
||||||
|
"kakadde", # million
|
||||||
|
"kawumbi", # billion
|
||||||
|
"kase", # trillion
|
||||||
|
"katabalika", # quadrillion
|
||||||
|
"keesedde", # gajillion
|
||||||
|
"kafukunya", # bazillion
|
||||||
|
"ekisooka", # first
|
||||||
|
"ekyokubiri", # second
|
||||||
|
"ekyokusatu", # third
|
||||||
|
"ekyokuna", # fourth
|
||||||
|
"ekyokutaano", # fifth
|
||||||
|
"ekyomukaaga", # sixth
|
||||||
|
"ekyomusanvu", # seventh
|
||||||
|
"eky'omunaana", # eighth
|
||||||
|
"ekyomwenda", # ninth
|
||||||
|
"ekyekkumi", # tenth
|
||||||
|
"ekyekkumi n'ekimu", # eleventh
|
||||||
|
"ekyekkumi n'ebibiri", # twelfth
|
||||||
|
"ekyekkumi n'ebisatu", # thirteenth
|
||||||
|
"ekyekkumi n'ebina", # fourteenth
|
||||||
|
"ekyekkumi n'ebitaano", # fifteenth
|
||||||
|
"ekyekkumi n'omukaaga", # sixteenth
|
||||||
|
"ekyekkumi n'omusanvu", # seventeenth
|
||||||
|
"ekyekkumi n'omunaana", # eighteenth
|
||||||
|
"ekyekkumi n'omwenda", # nineteenth
|
||||||
|
"ekyamakumi abiri", # twentieth
|
||||||
|
"ekyamakumi asatu", # thirtieth
|
||||||
|
"ekyamakumi ana", # fortieth
|
||||||
|
"ekyamakumi ataano", # fiftieth
|
||||||
|
"ekyenkaaga", # sixtieth
|
||||||
|
"ekyensanvu", # seventieth
|
||||||
|
"ekyekinaana", # eightieth
|
||||||
|
"ekyekyenda", # ninetieth
|
||||||
|
"ekyekikumi", # hundredth
|
||||||
|
"ekyolukumi", # thousandth
|
||||||
|
"ekyakakadde", # millionth
|
||||||
|
"ekyakawumbi", # billionth
|
||||||
|
"ekyakase", # trillionth
|
||||||
|
"ekyakatabalika", # quadrillionth
|
||||||
|
"ekyakeesedde", # gajillionth
|
||||||
|
"ekyakafukunya", # bazillionth
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def like_num(text):
|
||||||
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
|
text = text[1:]
|
||||||
|
text = text.replace(",", "").replace(".", "")
|
||||||
|
if text.isdigit():
|
||||||
|
return True
|
||||||
|
if text.count("/") == 1:
|
||||||
|
num, denom = text.split("/")
|
||||||
|
if num.isdigit() and denom.isdigit():
|
||||||
|
return True
|
||||||
|
text_lower = text.lower()
|
||||||
|
if text_lower in _num_words:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
19
spacy/lang/lg/punctuation.py
Normal file
19
spacy/lang/lg/punctuation.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
|
||||||
|
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||||
|
|
||||||
|
_infixes = (
|
||||||
|
LIST_ELLIPSES
|
||||||
|
+ LIST_ICONS
|
||||||
|
+ [
|
||||||
|
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||||
|
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||||
|
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||||
|
),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||||
|
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_INFIXES = _infixes
|
19
spacy/lang/lg/stop_words.py
Normal file
19
spacy/lang/lg/stop_words.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
|
abadde abalala abamu abangi abava ajja ali alina ani anti ateekeddwa atewamu
|
||||||
|
atya awamu aweebwa ayinza ba baali babadde babalina bajja
|
||||||
|
bajjanewankubade bali balina bandi bangi bano bateekeddwa baweebwa bayina bebombi beera bibye
|
||||||
|
bimu bingi bino bo bokka bonna buli bulijjo bulungi bwabwe bwaffe bwayo bwe bwonna bya byabwe
|
||||||
|
byaffe byebimu byonna ddaa ddala ddi e ebimu ebiri ebweruobulungi ebyo edda ejja ekirala ekyo
|
||||||
|
endala engeri ennyo era erimu erina ffe ffenna ga gujja gumu gunno guno gwa gwe kaseera kati
|
||||||
|
kennyini ki kiki kikino kikye kikyo kino kirungi kki ku kubangabyombi kubangaolwokuba kudda
|
||||||
|
kuva kuwa kwegamba kyaffe kye kyekimuoyo kyekyo kyonna leero liryo lwa lwaki lyabwezaabwe
|
||||||
|
lyaffe lyange mbadde mingi mpozzi mu mulinaoyina munda mwegyabwe nolwekyo nabadde nabo nandiyagadde
|
||||||
|
nandiye nanti naye ne nedda neera nga nnyingi nnyini nnyinza nnyo nti nyinza nze oba ojja okudda
|
||||||
|
okugenda okuggyako okutuusa okuva okuwa oli olina oluvannyuma olwekyobuva omuli ono osobola otya
|
||||||
|
oyina oyo seetaaga si sinakindi singa talina tayina tebaali tebaalina tebayina terina tetulina
|
||||||
|
tetuteekeddwa tewali teyalina teyayina tolina tu tuyina tulina tuyina twafuna twetaaga wa wabula
|
||||||
|
wabweru wadde waggulunnina wakati waliwobangi waliyo wandi wange wano wansi weebwa yabadde yaffe
|
||||||
|
ye yenna yennyini yina yonna ziba zijja zonna
|
||||||
|
""".split()
|
||||||
|
)
|
|
@ -40,6 +40,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
span_label = doc.vocab.strings.add("NP")
|
span_label = doc.vocab.strings.add("NP")
|
||||||
|
|
||||||
# Only NOUNS and PRONOUNS matter
|
# Only NOUNS and PRONOUNS matter
|
||||||
|
end_span = -1
|
||||||
for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
|
for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
|
||||||
# For NOUNS
|
# For NOUNS
|
||||||
# Pick children from syntactic parse (only those with certain dependencies)
|
# Pick children from syntactic parse (only those with certain dependencies)
|
||||||
|
@ -58,15 +59,17 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
children_i = [c.i for c in children] + [word.i]
|
children_i = [c.i for c in children] + [word.i]
|
||||||
|
|
||||||
start_span = min(children_i)
|
start_span = min(children_i)
|
||||||
end_span = max(children_i) + 1
|
if start_span >= end_span:
|
||||||
yield start_span, end_span, span_label
|
end_span = max(children_i) + 1
|
||||||
|
yield start_span, end_span, span_label
|
||||||
|
|
||||||
# PRONOUNS only if it is the subject of a verb
|
# PRONOUNS only if it is the subject of a verb
|
||||||
elif word.pos == PRON:
|
elif word.pos == PRON:
|
||||||
if word.dep in pronoun_deps:
|
if word.dep in pronoun_deps:
|
||||||
start_span = word.i
|
start_span = word.i
|
||||||
end_span = word.i + 1
|
if start_span >= end_span:
|
||||||
yield start_span, end_span, span_label
|
end_span = word.i + 1
|
||||||
|
yield start_span, end_span, span_label
|
||||||
|
|
||||||
|
|
||||||
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
||||||
|
|
|
@ -28,7 +28,7 @@ class Russian(Language):
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={
|
default_config={
|
||||||
"model": None,
|
"model": None,
|
||||||
"mode": "pymorphy2",
|
"mode": "pymorphy3",
|
||||||
"overwrite": False,
|
"overwrite": False,
|
||||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
},
|
},
|
||||||
|
|
|
@ -19,7 +19,7 @@ class RussianLemmatizer(Lemmatizer):
|
||||||
model: Optional[Model],
|
model: Optional[Model],
|
||||||
name: str = "lemmatizer",
|
name: str = "lemmatizer",
|
||||||
*,
|
*,
|
||||||
mode: str = "pymorphy2",
|
mode: str = "pymorphy3",
|
||||||
overwrite: bool = False,
|
overwrite: bool = False,
|
||||||
scorer: Optional[Callable] = lemmatizer_score,
|
scorer: Optional[Callable] = lemmatizer_score,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
@ -33,6 +33,16 @@ class RussianLemmatizer(Lemmatizer):
|
||||||
) from None
|
) from None
|
||||||
if getattr(self, "_morph", None) is None:
|
if getattr(self, "_morph", None) is None:
|
||||||
self._morph = MorphAnalyzer()
|
self._morph = MorphAnalyzer()
|
||||||
|
elif mode == "pymorphy3":
|
||||||
|
try:
|
||||||
|
from pymorphy3 import MorphAnalyzer
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"The Russian lemmatizer mode 'pymorphy3' requires the "
|
||||||
|
"pymorphy3 library. Install it with: pip install pymorphy3"
|
||||||
|
) from None
|
||||||
|
if getattr(self, "_morph", None) is None:
|
||||||
|
self._morph = MorphAnalyzer()
|
||||||
super().__init__(
|
super().__init__(
|
||||||
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
)
|
)
|
||||||
|
@ -104,6 +114,9 @@ class RussianLemmatizer(Lemmatizer):
|
||||||
return [analyses[0].normal_form]
|
return [analyses[0].normal_form]
|
||||||
return [string]
|
return [string]
|
||||||
|
|
||||||
|
def pymorphy3_lemmatize(self, token: Token) -> List[str]:
|
||||||
|
return self.pymorphy2_lemmatize(token)
|
||||||
|
|
||||||
|
|
||||||
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
|
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
|
||||||
gram_map = {
|
gram_map = {
|
||||||
|
|
|
@ -1,9 +1,17 @@
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from ...language import Language, BaseDefaults
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class SlovenianDefaults(BaseDefaults):
|
class SlovenianDefaults(BaseDefaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
|
prefixes = TOKENIZER_PREFIXES
|
||||||
|
infixes = TOKENIZER_INFIXES
|
||||||
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
|
||||||
|
|
||||||
class Slovenian(Language):
|
class Slovenian(Language):
|
||||||
|
|
145
spacy/lang/sl/lex_attrs.py
Normal file
145
spacy/lang/sl/lex_attrs.py
Normal file
|
@ -0,0 +1,145 @@
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
from ...attrs import IS_CURRENCY
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
|
||||||
|
# Cardinal numerals (base forms plus inflected forms) recognised by like_num.
# Spelling fixes relative to the previous list: "šestdeset", "devetdeset", "enem".
_num_words = set(
    """
    nula ničla nič ena dva tri štiri pet šest sedem osem
    devet deset enajst dvanajst trinajst štirinajst petnajst
    šestnajst sedemnajst osemnajst devetnajst dvajset trideset štirideset
    petdeset šestdeset sedemdeset osemdeset devetdeset sto tisoč
    milijon bilijon trilijon kvadrilijon nešteto

    en eden enega enemu enem enim enih enima enimi ene eni eno
    dveh dvema dvem dvoje trije treh trem tremi troje štirje štirih štirim štirimi
    petih petim petimi šestih šestim šestimi sedmih sedmim sedmimi osmih osmim osmimi
    devetih devetim devetimi desetih desetim desetimi enajstih enajstim enajstimi
    dvanajstih dvanajstim dvanajstimi trinajstih trinajstim trinajstimi
    šestnajstih šestnajstim šestnajstimi petnajstih petnajstim petnajstimi
    sedemnajstih sedemnajstim sedemnajstimi osemnajstih osemnajstim osemnajstimi
    devetnajstih devetnajstim devetnajstimi dvajsetih dvajsetim dvajsetimi
    """.split()
)

# Ordinal numerals, one paragraph per inflectional ending.
# Spelling fixes relative to the previous list: every "štirnajst-" form is now
# "štirinajst-" (matching the first paragraph), plus "šesto", "devetega",
# "sedmim", "sedmimi", "četrtih", "devetih" and "neštetih".
_ordinal_words = set(
    """
    prvi drugi tretji četrti peti šesti sedmi osmi
    deveti deseti enajsti dvanajsti trinajsti štirinajsti
    petnajsti šestnajsti sedemnajsti osemnajsti devetnajsti
    dvajseti trideseti štirideseti petdeseti šestdeseti sedemdeseti
    osemdeseti devetdeseti stoti tisoči milijonti bilijonti
    trilijonti kvadrilijonti nešteti

    prva druga tretja četrta peta šesta sedma osma
    deveta deseta enajsta dvanajsta trinajsta štirinajsta
    petnajsta šestnajsta sedemnajsta osemnajsta devetnajsta
    dvajseta trideseta štirideseta petdeseta šestdeseta sedemdeseta
    osemdeseta devetdeseta stota tisoča milijonta bilijonta
    trilijonta kvadrilijonta nešteta

    prvo drugo tretje četrto peto šesto sedmo osmo
    deveto deseto enajsto dvanajsto trinajsto štirinajsto
    petnajsto šestnajsto sedemnajsto osemnajsto devetnajsto
    dvajseto trideseto štirideseto petdeseto šestdeseto sedemdeseto
    osemdeseto devetdeseto stoto tisočo milijonto bilijonto
    trilijonto kvadrilijonto nešteto

    prvega drugega tretjega četrtega petega šestega sedmega osmega
    devetega desetega enajstega dvanajstega trinajstega štirinajstega
    petnajstega šestnajstega sedemnajstega osemnajstega devetnajstega
    dvajsetega tridesetega štiridesetega petdesetega šestdesetega sedemdesetega
    osemdesetega devetdesetega stotega tisočega milijontega bilijontega
    trilijontega kvadrilijontega neštetega

    prvemu drugemu tretjemu četrtemu petemu šestemu sedmemu osmemu devetemu desetemu
    enajstemu dvanajstemu trinajstemu štirinajstemu petnajstemu šestnajstemu sedemnajstemu
    osemnajstemu devetnajstemu dvajsetemu tridesetemu štiridesetemu petdesetemu šestdesetemu
    sedemdesetemu osemdesetemu devetdesetemu stotemu tisočemu milijontemu bilijontemu
    trilijontemu kvadrilijontemu neštetemu

    prvem drugem tretjem četrtem petem šestem sedmem osmem devetem desetem
    enajstem dvanajstem trinajstem štirinajstem petnajstem šestnajstem sedemnajstem
    osemnajstem devetnajstem dvajsetem tridesetem štiridesetem petdesetem šestdesetem
    sedemdesetem osemdesetem devetdesetem stotem tisočem milijontem bilijontem
    trilijontem kvadrilijontem neštetem

    prvim drugim tretjim četrtim petim šestim sedmim osmim devetim desetim
    enajstim dvanajstim trinajstim štirinajstim petnajstim šestnajstim sedemnajstim
    osemnajstim devetnajstim dvajsetim tridesetim štiridesetim petdesetim šestdesetim
    sedemdesetim osemdesetim devetdesetim stotim tisočim milijontim bilijontim
    trilijontim kvadrilijontim neštetim

    prvih drugih tretjih četrtih petih šestih sedmih osmih devetih desetih
    enajstih dvanajstih trinajstih štirinajstih petnajstih šestnajstih sedemnajstih
    osemnajstih devetnajstih dvajsetih tridesetih štiridesetih petdesetih šestdesetih
    sedemdesetih osemdesetih devetdesetih stotih tisočih milijontih bilijontih
    trilijontih kvadrilijontih neštetih

    prvima drugima tretjima četrtima petima šestima sedmima osmima devetima desetima
    enajstima dvanajstima trinajstima štirinajstima petnajstima šestnajstima sedemnajstima
    osemnajstima devetnajstima dvajsetima tridesetima štiridesetima petdesetima šestdesetima
    sedemdesetima osemdesetima devetdesetima stotima tisočima milijontima bilijontima
    trilijontima kvadrilijontima neštetima

    prve druge četrte pete šeste sedme osme devete desete
    enajste dvanajste trinajste štirinajste petnajste šestnajste sedemnajste
    osemnajste devetnajste dvajsete tridesete štiridesete petdesete šestdesete
    sedemdesete osemdesete devetdesete stote tisoče milijonte bilijonte
    trilijonte kvadrilijonte neštete

    prvimi drugimi tretjimi četrtimi petimi šestimi sedmimi osmimi devetimi desetimi
    enajstimi dvanajstimi trinajstimi štirinajstimi petnajstimi šestnajstimi sedemnajstimi
    osemnajstimi devetnajstimi dvajsetimi tridesetimi štiridesetimi petdesetimi šestdesetimi
    sedemdesetimi osemdesetimi devetdesetimi stotimi tisočimi milijontimi bilijontimi
    trilijontimi kvadrilijontimi neštetimi
    """.split()
)

# Currency names (inflected forms) and lowercase currency codes recognised by
# is_currency. Fixes relative to the previous list: "gbp" (was "gpb") and
# "centom" (was "cenom").
_currency_words = set(
    """
    evro evra evru evrom evrov evroma evrih evre evri evr eur
    cent centa centu centom centov centoma centih cente centi
    dolar dolarja dolarji dolarju dolarjem dolarjev dolarjema dolarjih dolarje usd
    tolar tolarja tolarji tolarju tolarjem tolarjev tolarjema tolarjih tolarje tol
    dinar dinarja dinarji dinarju dinarjem dinarjev dinarjema dinarjih dinarje din
    funt funta funti funtu funtom funtov funtoma funtih funte gbp
    forint forinta forinti forintu forintom forintov forintoma forintih forinte
    zlot zlota zloti zlotu zlotom zlotov zlotoma zlotih zlote
    rupij rupija rupiji rupiju rupijem rupijev rupijema rupijih rupije
    jen jena jeni jenu jenom jenov jenoma jenih jene
    kuna kuni kune kuno kun kunama kunah kunam kunami
    marka marki marke markama markah markami
    """.split()
)


def like_num(text):
    """Return True if ``text`` looks like a number.

    Accepted forms, checked in this order:
    * digits, after dropping one leading sign character ("+", "-", "±", "~")
      and all "," and "." separators;
    * a simple fraction "<digits>/<digits>";
    * a word that, lowercased, is in ``_num_words`` or ``_ordinal_words``.
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Separators are removed unconditionally, so "10.000" and "1,5" both pass.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    if text_lower in _ordinal_words:
        return True
    return False


def is_currency(text):
    """Return True if ``text`` is a currency word or consists only of currency symbols.

    The word lookup is case-insensitive: the lowercased text is matched against
    ``_currency_words`` (previously the lowercased value was computed but the
    original-case text was tested, so "EUR" or "Evro" were rejected).
    Otherwise every character must be in Unicode category "Sc".
    """
    text_lower = text.lower()
    if text_lower in _currency_words:
        return True
    for char in text:
        if unicodedata.category(char) != "Sc":
            return False
    # NOTE(review): an empty string also reaches this point and yields True,
    # exactly as before this change.
    return True
|
||||||
|
|
||||||
|
|
||||||
|
# Maps the LIKE_NUM and IS_CURRENCY attribute constants to the like_num and
# is_currency functions defined in this module.
LEX_ATTRS = {LIKE_NUM: like_num, IS_CURRENCY: is_currency}
|
84
spacy/lang/sl/punctuation.py
Normal file
84
spacy/lang/sl/punctuation.py
Normal file
|
@ -0,0 +1,84 @@
|
||||||
|
from ..char_classes import (
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
HYPHENS,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
CURRENCY,
|
||||||
|
UNITS,
|
||||||
|
PUNCT,
|
||||||
|
LIST_CURRENCY,
|
||||||
|
CONCAT_QUOTES,
|
||||||
|
)
|
||||||
|
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||||
|
from ..char_classes import merge_chars
|
||||||
|
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
||||||
|
|
||||||
|
|
||||||
|
# Regex-escaped special characters plus every alternative from HYPHENS
# (HYPHENS is split on "|" here); reused below in both _prefixes and _suffixes.
INCLUDE_SPECIAL = ["\\+", "\\/", "\\•", "\\¯", "\\=", "\\×"] + HYPHENS.split("|")
|
||||||
|
|
||||||
|
# Prefix patterns: the special characters above followed by the shared base prefixes.
_prefixes = INCLUDE_SPECIAL + BASE_TOKENIZER_PREFIXES
|
||||||
|
|
||||||
|
# Suffix patterns: the special characters, the shared punctuation, ellipsis,
# quote and icon lists, and a handful of context-dependent rules.
_suffixes = (
    INCLUDE_SPECIAL
    + LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_ICONS
    + [
        # period after a degree unit such as "°C"
        r"(?<=°[FfCcKk])\.",
        # currency symbol or unit directly after a digit
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
        # period after a lowercase letter, one of "%²-+", a quote or punctuation
        r"(?<=[{al}{e}{p}(?:{q})])\.".format(
            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
        ),
        # period after two uppercase letters
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
        # split initials like J.K. Rowling; the trailing dot is escaped so that
        # only "<capital>." matches, not a capital followed by any character
        r"(?<=[A-Z]\.)(?:[A-Z]\.)",
    ]
)
|
||||||
|
|
||||||
|
# a list of all suffixes following a hyphen that shouldn't be split off (e.g. BTC-jev)
|
||||||
|
# source: Obeliks tokenizer - https://github.com/clarinsi/obeliks/blob/master/obeliks/res/TokRulesPart1.txt
|
||||||
|
CONCAT_QUOTES = CONCAT_QUOTES.replace("'", "")
|
||||||
|
HYPHENS_PERMITTED = (
|
||||||
|
"((a)|(evemu)|(evskega)|(i)|(jevega)|(jevska)|(jevskimi)|(jinemu)|(oma)|(ovim)|"
|
||||||
|
"(ovski)|(e)|(evi)|(evskem)|(ih)|(jevem)|(jevske)|(jevsko)|(jini)|(ov)|(ovima)|"
|
||||||
|
"(ovskih)|(em)|(evih)|(evskemu)|(ja)|(jevemu)|(jevskega)|(ji)|(jinih)|(ova)|"
|
||||||
|
"(ovimi)|(ovskim)|(ema)|(evim)|(evski)|(je)|(jevi)|(jevskem)|(jih)|(jinim)|"
|
||||||
|
"(ove)|(ovo)|(ovskima)|(ev)|(evima)|(evskih)|(jem)|(jevih)|(jevskemu)|(jin)|"
|
||||||
|
"(jinima)|(ovega)|(ovska)|(ovskimi)|(eva)|(evimi)|(evskim)|(jema)|(jevim)|"
|
||||||
|
"(jevski)|(jina)|(jinimi)|(ovem)|(ovske)|(ovsko)|(eve)|(evo)|(evskima)|(jev)|"
|
||||||
|
"(jevima)|(jevskih)|(jine)|(jino)|(ovemu)|(ovskega)|(u)|(evega)|(evska)|"
|
||||||
|
"(evskimi)|(jeva)|(jevimi)|(jevskim)|(jinega)|(ju)|(ovi)|(ovskem)|(evem)|"
|
||||||
|
"(evske)|(evsko)|(jeve)|(jevo)|(jevskima)|(jinem)|(om)|(ovih)|(ovskemu)|"
|
||||||
|
"(ovec)|(ovca)|(ovcu)|(ovcem)|(ovcev)|(ovcema)|(ovcih)|(ovci)|(ovce)|(ovcimi)|"
|
||||||
|
"(evec)|(evca)|(evcu)|(evcem)|(evcev)|(evcema)|(evcih)|(evci)|(evce)|(evcimi)|"
|
||||||
|
"(jevec)|(jevca)|(jevcu)|(jevcem)|(jevcev)|(jevcema)|(jevcih)|(jevci)|(jevce)|"
|
||||||
|
"(jevcimi)|(ovka)|(ovke)|(ovki)|(ovko)|(ovk)|(ovkama)|(ovkah)|(ovkam)|(ovkami)|"
|
||||||
|
"(evka)|(evke)|(evki)|(evko)|(evk)|(evkama)|(evkah)|(evkam)|(evkami)|(jevka)|"
|
||||||
|
"(jevke)|(jevki)|(jevko)|(jevk)|(jevkama)|(jevkah)|(jevkam)|(jevkami)|(timi)|"
|
||||||
|
"(im)|(ima)|(a)|(imi)|(e)|(o)|(ega)|(ti)|(em)|(tih)|(emu)|(tim)|(i)|(tima)|"
|
||||||
|
"(ih)|(ta)|(te)|(to)|(tega)|(tem)|(temu))"
|
||||||
|
)
|
||||||
|
|
||||||
|
_infixes = (
|
||||||
|
LIST_ELLIPSES
|
||||||
|
+ LIST_ICONS
|
||||||
|
+ [
|
||||||
|
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||||
|
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||||
|
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||||
|
),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}0-9])(?:{h})(?!{hp}$)(?=[{a}])".format(
|
||||||
|
a=ALPHA, h=HYPHENS, hp=HYPHENS_PERMITTED
|
||||||
|
),
|
||||||
|
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_PREFIXES = _prefixes
|
||||||
|
TOKENIZER_SUFFIXES = _suffixes
|
||||||
|
TOKENIZER_INFIXES = _infixes
|
|
@ -1,326 +1,84 @@
|
||||||
# Source: https://github.com/stopwords-iso/stopwords-sl
|
# Source: https://github.com/stopwords-iso/stopwords-sl
|
||||||
# Removed various words that are not normally considered stop words, such as months.
|
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
a
|
a ali
|
||||||
ali
|
|
||||||
b
|
b bi bil bila bile bili bilo biti blizu bo bodo bojo bolj bom bomo
|
||||||
bi
|
boste bova boš brez
|
||||||
bil
|
|
||||||
bila
|
c cel cela celi celo
|
||||||
bile
|
|
||||||
bili
|
č če često četrta četrtek četrti četrto čez čigav
|
||||||
bilo
|
|
||||||
biti
|
d da daleč dan danes datum deset deseta deseti deseto devet
|
||||||
blizu
|
deveta deveti deveto do dober dobra dobri dobro dokler dol dolg
|
||||||
bo
|
dolga dolgi dovolj drug druga drugi drugo dva dve
|
||||||
bodo
|
|
||||||
bolj
|
e eden en ena ene eni enkrat eno etc.
|
||||||
bom
|
|
||||||
bomo
|
|
||||||
boste
|
|
||||||
bova
|
|
||||||
boš
|
|
||||||
brez
|
|
||||||
c
|
|
||||||
cel
|
|
||||||
cela
|
|
||||||
celi
|
|
||||||
celo
|
|
||||||
d
|
|
||||||
da
|
|
||||||
daleč
|
|
||||||
dan
|
|
||||||
danes
|
|
||||||
do
|
|
||||||
dober
|
|
||||||
dobra
|
|
||||||
dobri
|
|
||||||
dobro
|
|
||||||
dokler
|
|
||||||
dol
|
|
||||||
dovolj
|
|
||||||
e
|
|
||||||
eden
|
|
||||||
en
|
|
||||||
ena
|
|
||||||
ene
|
|
||||||
eni
|
|
||||||
enkrat
|
|
||||||
eno
|
|
||||||
etc.
|
|
||||||
f
|
f
|
||||||
g
|
|
||||||
g.
|
g g. ga ga. gor gospa gospod
|
||||||
ga
|
|
||||||
ga.
|
h halo
|
||||||
gor
|
|
||||||
gospa
|
i idr. ii iii in iv ix iz
|
||||||
gospod
|
|
||||||
h
|
j jaz je ji jih jim jo jutri
|
||||||
halo
|
|
||||||
i
|
k kadarkoli kaj kajti kako kakor kamor kamorkoli kar karkoli
|
||||||
idr.
|
katerikoli kdaj kdo kdorkoli ker ki kje kjer kjerkoli
|
||||||
ii
|
ko koder koderkoli koga komu kot kratek kratka kratke kratki
|
||||||
iii
|
|
||||||
in
|
l lahka lahke lahki lahko le lep lepa lepe lepi lepo leto
|
||||||
iv
|
|
||||||
ix
|
m majhen majhna majhni malce malo manj me med medtem mene
|
||||||
iz
|
mesec mi midva midve mnogo moj moja moje mora morajo moram
|
||||||
j
|
moramo morate moraš morem mu
|
||||||
jaz
|
|
||||||
je
|
n na nad naj najina najino najmanj naju največ nam narobe
|
||||||
ji
|
nas nato nazaj naš naša naše ne nedavno nedelja nek neka
|
||||||
jih
|
nekaj nekatere nekateri nekatero nekdo neke nekega neki
|
||||||
jim
|
nekje neko nekoga nekoč ni nikamor nikdar nikjer nikoli
|
||||||
jo
|
nič nje njega njegov njegova njegovo njej njemu njen
|
||||||
k
|
njena njeno nji njih njihov njihova njihovo njiju njim
|
||||||
kadarkoli
|
njo njun njuna njuno no nocoj npr.
|
||||||
kaj
|
|
||||||
kajti
|
o ob oba obe oboje od odprt odprta odprti okoli on
|
||||||
kako
|
onadva one oni onidve osem osma osmi osmo oz.
|
||||||
kakor
|
|
||||||
kamor
|
p pa pet peta petek peti peto po pod pogosto poleg poln
|
||||||
kamorkoli
|
polna polni polno ponavadi ponedeljek ponovno potem
|
||||||
kar
|
povsod pozdravljen pozdravljeni prav prava prave pravi
|
||||||
karkoli
|
pravo prazen prazna prazno prbl. precej pred prej preko
|
||||||
katerikoli
|
pri pribl. približno primer pripravljen pripravljena
|
||||||
kdaj
|
pripravljeni proti prva prvi prvo
|
||||||
kdo
|
|
||||||
kdorkoli
|
r ravno redko res reč
|
||||||
ker
|
|
||||||
ki
|
s saj sam sama same sami samo se sebe sebi sedaj sedem
|
||||||
kje
|
sedma sedmi sedmo sem seveda si sicer skoraj skozi slab sm
|
||||||
kjer
|
so sobota spet sreda srednja srednji sta ste stran stvar sva
|
||||||
kjerkoli
|
|
||||||
ko
|
š šest šesta šesti šesto štiri
|
||||||
koderkoli
|
|
||||||
koga
|
t ta tak taka take taki tako takoj tam te tebe tebi tega
|
||||||
komu
|
težak težka težki težko ti tista tiste tisti tisto tj.
|
||||||
kot
|
tja to toda torek tretja tretje tretji tri tu tudi tukaj
|
||||||
l
|
tvoj tvoja tvoje
|
||||||
le
|
|
||||||
lep
|
|
||||||
lepa
|
|
||||||
lepe
|
|
||||||
lepi
|
|
||||||
lepo
|
|
||||||
m
|
|
||||||
manj
|
|
||||||
me
|
|
||||||
med
|
|
||||||
medtem
|
|
||||||
mene
|
|
||||||
mi
|
|
||||||
midva
|
|
||||||
midve
|
|
||||||
mnogo
|
|
||||||
moj
|
|
||||||
moja
|
|
||||||
moje
|
|
||||||
mora
|
|
||||||
morajo
|
|
||||||
moram
|
|
||||||
moramo
|
|
||||||
morate
|
|
||||||
moraš
|
|
||||||
morem
|
|
||||||
mu
|
|
||||||
n
|
|
||||||
na
|
|
||||||
nad
|
|
||||||
naj
|
|
||||||
najina
|
|
||||||
najino
|
|
||||||
najmanj
|
|
||||||
naju
|
|
||||||
največ
|
|
||||||
nam
|
|
||||||
nas
|
|
||||||
nato
|
|
||||||
nazaj
|
|
||||||
naš
|
|
||||||
naša
|
|
||||||
naše
|
|
||||||
ne
|
|
||||||
nedavno
|
|
||||||
nek
|
|
||||||
neka
|
|
||||||
nekaj
|
|
||||||
nekatere
|
|
||||||
nekateri
|
|
||||||
nekatero
|
|
||||||
nekdo
|
|
||||||
neke
|
|
||||||
nekega
|
|
||||||
neki
|
|
||||||
nekje
|
|
||||||
neko
|
|
||||||
nekoga
|
|
||||||
nekoč
|
|
||||||
ni
|
|
||||||
nikamor
|
|
||||||
nikdar
|
|
||||||
nikjer
|
|
||||||
nikoli
|
|
||||||
nič
|
|
||||||
nje
|
|
||||||
njega
|
|
||||||
njegov
|
|
||||||
njegova
|
|
||||||
njegovo
|
|
||||||
njej
|
|
||||||
njemu
|
|
||||||
njen
|
|
||||||
njena
|
|
||||||
njeno
|
|
||||||
nji
|
|
||||||
njih
|
|
||||||
njihov
|
|
||||||
njihova
|
|
||||||
njihovo
|
|
||||||
njiju
|
|
||||||
njim
|
|
||||||
njo
|
|
||||||
njun
|
|
||||||
njuna
|
|
||||||
njuno
|
|
||||||
no
|
|
||||||
nocoj
|
|
||||||
npr.
|
|
||||||
o
|
|
||||||
ob
|
|
||||||
oba
|
|
||||||
obe
|
|
||||||
oboje
|
|
||||||
od
|
|
||||||
okoli
|
|
||||||
on
|
|
||||||
onadva
|
|
||||||
one
|
|
||||||
oni
|
|
||||||
onidve
|
|
||||||
oz.
|
|
||||||
p
|
|
||||||
pa
|
|
||||||
po
|
|
||||||
pod
|
|
||||||
pogosto
|
|
||||||
poleg
|
|
||||||
ponavadi
|
|
||||||
ponovno
|
|
||||||
potem
|
|
||||||
povsod
|
|
||||||
prbl.
|
|
||||||
precej
|
|
||||||
pred
|
|
||||||
prej
|
|
||||||
preko
|
|
||||||
pri
|
|
||||||
pribl.
|
|
||||||
približno
|
|
||||||
proti
|
|
||||||
r
|
|
||||||
redko
|
|
||||||
res
|
|
||||||
s
|
|
||||||
saj
|
|
||||||
sam
|
|
||||||
sama
|
|
||||||
same
|
|
||||||
sami
|
|
||||||
samo
|
|
||||||
se
|
|
||||||
sebe
|
|
||||||
sebi
|
|
||||||
sedaj
|
|
||||||
sem
|
|
||||||
seveda
|
|
||||||
si
|
|
||||||
sicer
|
|
||||||
skoraj
|
|
||||||
skozi
|
|
||||||
smo
|
|
||||||
so
|
|
||||||
spet
|
|
||||||
sta
|
|
||||||
ste
|
|
||||||
sva
|
|
||||||
t
|
|
||||||
ta
|
|
||||||
tak
|
|
||||||
taka
|
|
||||||
take
|
|
||||||
taki
|
|
||||||
tako
|
|
||||||
takoj
|
|
||||||
tam
|
|
||||||
te
|
|
||||||
tebe
|
|
||||||
tebi
|
|
||||||
tega
|
|
||||||
ti
|
|
||||||
tista
|
|
||||||
tiste
|
|
||||||
tisti
|
|
||||||
tisto
|
|
||||||
tj.
|
|
||||||
tja
|
|
||||||
to
|
|
||||||
toda
|
|
||||||
tu
|
|
||||||
tudi
|
|
||||||
tukaj
|
|
||||||
tvoj
|
|
||||||
tvoja
|
|
||||||
tvoje
|
|
||||||
u
|
u
|
||||||
v
|
|
||||||
vaju
|
v vaju vam vas vaš vaša vaše ve vedno velik velika veliki
|
||||||
vam
|
veliko vendar ves več vi vidva vii viii visok visoka visoke
|
||||||
vas
|
visoki vsa vsaj vsak vsaka vsakdo vsake vsaki vsakomur vse
|
||||||
vaš
|
vsega vsi vso včasih včeraj
|
||||||
vaša
|
|
||||||
vaše
|
|
||||||
ve
|
|
||||||
vedno
|
|
||||||
vendar
|
|
||||||
ves
|
|
||||||
več
|
|
||||||
vi
|
|
||||||
vidva
|
|
||||||
vii
|
|
||||||
viii
|
|
||||||
vsa
|
|
||||||
vsaj
|
|
||||||
vsak
|
|
||||||
vsaka
|
|
||||||
vsakdo
|
|
||||||
vsake
|
|
||||||
vsaki
|
|
||||||
vsakomur
|
|
||||||
vse
|
|
||||||
vsega
|
|
||||||
vsi
|
|
||||||
vso
|
|
||||||
včasih
|
|
||||||
x
|
x
|
||||||
z
|
|
||||||
za
|
z za zadaj zadnji zakaj zaprta zaprti zaprto zdaj zelo zunaj
|
||||||
zadaj
|
|
||||||
zadnji
|
ž že
|
||||||
zakaj
|
|
||||||
zdaj
|
|
||||||
zelo
|
|
||||||
zunaj
|
|
||||||
č
|
|
||||||
če
|
|
||||||
često
|
|
||||||
čez
|
|
||||||
čigav
|
|
||||||
š
|
|
||||||
ž
|
|
||||||
že
|
|
||||||
""".split()
|
""".split()
|
||||||
)
|
)
|
||||||
|
|
272
spacy/lang/sl/tokenizer_exceptions.py
Normal file
272
spacy/lang/sl/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,272 @@
|
||||||
|
from typing import Dict, List
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
from ...symbols import ORTH, NORM
|
||||||
|
from ...util import update_exc
|
||||||
|
|
||||||
|
_exc: Dict[str, List[Dict]] = {}
|
||||||
|
|
||||||
|
_other_exc = {
|
||||||
|
"t.i.": [{ORTH: "t.", NORM: "tako"}, {ORTH: "i.", NORM: "imenovano"}],
|
||||||
|
"t.j.": [{ORTH: "t.", NORM: "to"}, {ORTH: "j.", NORM: "je"}],
|
||||||
|
"T.j.": [{ORTH: "T.", NORM: "to"}, {ORTH: "j.", NORM: "je"}],
|
||||||
|
"d.o.o.": [
|
||||||
|
{ORTH: "d.", NORM: "družba"},
|
||||||
|
{ORTH: "o.", NORM: "omejeno"},
|
||||||
|
{ORTH: "o.", NORM: "odgovornostjo"},
|
||||||
|
],
|
||||||
|
"D.O.O.": [
|
||||||
|
{ORTH: "D.", NORM: "družba"},
|
||||||
|
{ORTH: "O.", NORM: "omejeno"},
|
||||||
|
{ORTH: "O.", NORM: "odgovornostjo"},
|
||||||
|
],
|
||||||
|
"d.n.o.": [
|
||||||
|
{ORTH: "d.", NORM: "družba"},
|
||||||
|
{ORTH: "n.", NORM: "neomejeno"},
|
||||||
|
{ORTH: "o.", NORM: "odgovornostjo"},
|
||||||
|
],
|
||||||
|
"D.N.O.": [
|
||||||
|
{ORTH: "D.", NORM: "družba"},
|
||||||
|
{ORTH: "N.", NORM: "neomejeno"},
|
||||||
|
{ORTH: "O.", NORM: "odgovornostjo"},
|
||||||
|
],
|
||||||
|
"d.d.": [{ORTH: "d.", NORM: "delniška"}, {ORTH: "d.", NORM: "družba"}],
|
||||||
|
"D.D.": [{ORTH: "D.", NORM: "delniška"}, {ORTH: "D.", NORM: "družba"}],
|
||||||
|
"s.p.": [{ORTH: "s.", NORM: "samostojni"}, {ORTH: "p.", NORM: "podjetnik"}],
|
||||||
|
"S.P.": [{ORTH: "S.", NORM: "samostojni"}, {ORTH: "P.", NORM: "podjetnik"}],
|
||||||
|
"l.r.": [{ORTH: "l.", NORM: "lastno"}, {ORTH: "r.", NORM: "ročno"}],
|
||||||
|
"le-te": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "te"}],
|
||||||
|
"Le-te": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "te"}],
|
||||||
|
"le-ti": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "ti"}],
|
||||||
|
"Le-ti": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "ti"}],
|
||||||
|
"le-to": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "to"}],
|
||||||
|
"Le-to": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "to"}],
|
||||||
|
"le-ta": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "ta"}],
|
||||||
|
"Le-ta": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "ta"}],
|
||||||
|
"le-tega": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "tega"}],
|
||||||
|
"Le-tega": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "tega"}],
|
||||||
|
}
|
||||||
|
|
||||||
|
_exc.update(_other_exc)
|
||||||
|
|
||||||
|
|
||||||
|
for exc_data in [
|
||||||
|
{ORTH: "adm.", NORM: "administracija"},
|
||||||
|
{ORTH: "aer.", NORM: "aeronavtika"},
|
||||||
|
{ORTH: "agr.", NORM: "agronomija"},
|
||||||
|
{ORTH: "amer.", NORM: "ameriško"},
|
||||||
|
{ORTH: "anat.", NORM: "anatomija"},
|
||||||
|
{ORTH: "angl.", NORM: "angleški"},
|
||||||
|
{ORTH: "ant.", NORM: "antonim"},
|
||||||
|
{ORTH: "antr.", NORM: "antropologija"},
|
||||||
|
{ORTH: "apr.", NORM: "april"},
|
||||||
|
{ORTH: "arab.", NORM: "arabsko"},
|
||||||
|
{ORTH: "arheol.", NORM: "arheologija"},
|
||||||
|
{ORTH: "arhit.", NORM: "arhitektura"},
|
||||||
|
{ORTH: "avg.", NORM: "avgust"},
|
||||||
|
{ORTH: "avstr.", NORM: "avstrijsko"},
|
||||||
|
{ORTH: "avt.", NORM: "avtomobilizem"},
|
||||||
|
{ORTH: "bibl.", NORM: "biblijsko"},
|
||||||
|
{ORTH: "biokem.", NORM: "biokemija"},
|
||||||
|
{ORTH: "biol.", NORM: "biologija"},
|
||||||
|
{ORTH: "bolg.", NORM: "bolgarski"},
|
||||||
|
{ORTH: "bot.", NORM: "botanika"},
|
||||||
|
{ORTH: "cit.", NORM: "citat"},
|
||||||
|
{ORTH: "daj.", NORM: "dajalnik"},
|
||||||
|
{ORTH: "del.", NORM: "deležnik"},
|
||||||
|
{ORTH: "ed.", NORM: "ednina"},
|
||||||
|
{ORTH: "etn.", NORM: "etnografija"},
|
||||||
|
{ORTH: "farm.", NORM: "farmacija"},
|
||||||
|
{ORTH: "filat.", NORM: "filatelija"},
|
||||||
|
{ORTH: "filoz.", NORM: "filozofija"},
|
||||||
|
{ORTH: "fin.", NORM: "finančništvo"},
|
||||||
|
{ORTH: "fiz.", NORM: "fizika"},
|
||||||
|
{ORTH: "fot.", NORM: "fotografija"},
|
||||||
|
{ORTH: "fr.", NORM: "francoski"},
|
||||||
|
{ORTH: "friz.", NORM: "frizerstvo"},
|
||||||
|
{ORTH: "gastr.", NORM: "gastronomija"},
|
||||||
|
{ORTH: "geogr.", NORM: "geografija"},
|
||||||
|
{ORTH: "geol.", NORM: "geologija"},
|
||||||
|
{ORTH: "geom.", NORM: "geometrija"},
|
||||||
|
{ORTH: "germ.", NORM: "germanski"},
|
||||||
|
{ORTH: "gl.", NORM: "glej"},
|
||||||
|
{ORTH: "glag.", NORM: "glagolski"},
|
||||||
|
{ORTH: "glasb.", NORM: "glasba"},
|
||||||
|
{ORTH: "gled.", NORM: "gledališče"},
|
||||||
|
{ORTH: "gost.", NORM: "gostinstvo"},
|
||||||
|
{ORTH: "gozd.", NORM: "gozdarstvo"},
|
||||||
|
{ORTH: "gr.", NORM: "grški"},
|
||||||
|
{ORTH: "grad.", NORM: "gradbeništvo"},
|
||||||
|
{ORTH: "hebr.", NORM: "hebrejsko"},
|
||||||
|
{ORTH: "hrv.", NORM: "hrvaško"},
|
||||||
|
{ORTH: "ide.", NORM: "indoevropsko"},
|
||||||
|
{ORTH: "igr.", NORM: "igre"},
|
||||||
|
{ORTH: "im.", NORM: "imenovalnik"},
|
||||||
|
{ORTH: "iron.", NORM: "ironično"},
|
||||||
|
{ORTH: "it.", NORM: "italijanski"},
|
||||||
|
{ORTH: "itd.", NORM: "in tako dalje"},
|
||||||
|
{ORTH: "itn.", NORM: "in tako naprej"},
|
||||||
|
{ORTH: "ipd.", NORM: "in podobno"},
|
||||||
|
{ORTH: "jap.", NORM: "japonsko"},
|
||||||
|
{ORTH: "jul.", NORM: "julij"},
|
||||||
|
{ORTH: "jun.", NORM: "junij"},
|
||||||
|
{ORTH: "kit.", NORM: "kitajsko"},
|
||||||
|
{ORTH: "knj.", NORM: "knjižno"},
|
||||||
|
{ORTH: "knjiž.", NORM: "knjižno"},
|
||||||
|
{ORTH: "kor.", NORM: "koreografija"},
|
||||||
|
{ORTH: "lat.", NORM: "latinski"},
|
||||||
|
{ORTH: "les.", NORM: "lesna stroka"},
|
||||||
|
{ORTH: "lingv.", NORM: "lingvistika"},
|
||||||
|
{ORTH: "lit.", NORM: "literarni"},
|
||||||
|
{ORTH: "ljubk.", NORM: "ljubkovalno"},
|
||||||
|
{ORTH: "lov.", NORM: "lovstvo"},
|
||||||
|
{ORTH: "m.", NORM: "moški"},
|
||||||
|
{ORTH: "mak.", NORM: "makedonski"},
|
||||||
|
{ORTH: "mar.", NORM: "marec"},
|
||||||
|
{ORTH: "mat.", NORM: "matematika"},
|
||||||
|
{ORTH: "med.", NORM: "medicina"},
|
||||||
|
{ORTH: "meh.", NORM: "mehiško"},
|
||||||
|
{ORTH: "mest.", NORM: "mestnik"},
|
||||||
|
{ORTH: "mdr.", NORM: "med drugim"},
|
||||||
|
{ORTH: "min.", NORM: "mineralogija"},
|
||||||
|
{ORTH: "mitol.", NORM: "mitologija"},
|
||||||
|
{ORTH: "mn.", NORM: "množina"},
|
||||||
|
{ORTH: "mont.", NORM: "montanistika"},
|
||||||
|
{ORTH: "muz.", NORM: "muzikologija"},
|
||||||
|
{ORTH: "nam.", NORM: "namenilnik"},
|
||||||
|
{ORTH: "nar.", NORM: "narečno"},
|
||||||
|
{ORTH: "nav.", NORM: "navadno"},
|
||||||
|
{ORTH: "nedol.", NORM: "nedoločnik"},
|
||||||
|
{ORTH: "nedov.", NORM: "nedovršni"},
|
||||||
|
{ORTH: "neprav.", NORM: "nepravilno"},
|
||||||
|
{ORTH: "nepreh.", NORM: "neprehodno"},
|
||||||
|
{ORTH: "neskl.", NORM: "nesklonljiv(o)"},
|
||||||
|
{ORTH: "nestrok.", NORM: "nestrokovno"},
|
||||||
|
{ORTH: "num.", NORM: "numizmatika"},
|
||||||
|
{ORTH: "npr.", NORM: "na primer"},
|
||||||
|
{ORTH: "obrt.", NORM: "obrtništvo"},
|
||||||
|
{ORTH: "okt.", NORM: "oktober"},
|
||||||
|
{ORTH: "or.", NORM: "orodnik"},
|
||||||
|
{ORTH: "os.", NORM: "oseba"},
|
||||||
|
{ORTH: "otr.", NORM: "otroško"},
|
||||||
|
{ORTH: "oz.", NORM: "oziroma"},
|
||||||
|
{ORTH: "pal.", NORM: "paleontologija"},
|
||||||
|
{ORTH: "papir.", NORM: "papirništvo"},
|
||||||
|
{ORTH: "ped.", NORM: "pedagogika"},
|
||||||
|
{ORTH: "pisar.", NORM: "pisarniško"},
|
||||||
|
{ORTH: "pog.", NORM: "pogovorno"},
|
||||||
|
{ORTH: "polit.", NORM: "politika"},
|
||||||
|
{ORTH: "polj.", NORM: "poljsko"},
|
||||||
|
{ORTH: "poljud.", NORM: "poljudno"},
|
||||||
|
{ORTH: "preg.", NORM: "pregovor"},
|
||||||
|
{ORTH: "preh.", NORM: "prehodno"},
|
||||||
|
{ORTH: "pren.", NORM: "preneseno"},
|
||||||
|
{ORTH: "prid.", NORM: "pridevnik"},
|
||||||
|
{ORTH: "prim.", NORM: "primerjaj"},
|
||||||
|
{ORTH: "prisl.", NORM: "prislov"},
|
||||||
|
{ORTH: "psih.", NORM: "psihologija"},
|
||||||
|
{ORTH: "psiht.", NORM: "psihiatrija"},
|
||||||
|
{ORTH: "rad.", NORM: "radiotehnika"},
|
||||||
|
{ORTH: "rač.", NORM: "računalništvo"},
|
||||||
|
{ORTH: "rib.", NORM: "ribištvo"},
|
||||||
|
{ORTH: "rod.", NORM: "rodilnik"},
|
||||||
|
{ORTH: "rus.", NORM: "rusko"},
|
||||||
|
{ORTH: "s.", NORM: "srednji"},
|
||||||
|
{ORTH: "sam.", NORM: "samostalniški"},
|
||||||
|
{ORTH: "sed.", NORM: "sedanjik"},
|
||||||
|
{ORTH: "sep.", NORM: "september"},
|
||||||
|
{ORTH: "slabš.", NORM: "slabšalno"},
|
||||||
|
{ORTH: "slovan.", NORM: "slovansko"},
|
||||||
|
{ORTH: "slovaš.", NORM: "slovaško"},
|
||||||
|
{ORTH: "srb.", NORM: "srbsko"},
|
||||||
|
{ORTH: "star.", NORM: "starinsko"},
|
||||||
|
{ORTH: "stil.", NORM: "stilno"},
|
||||||
|
{ORTH: "sv.", NORM: "svet(i)"},
|
||||||
|
{ORTH: "teh.", NORM: "tehnika"},
|
||||||
|
{ORTH: "tisk.", NORM: "tiskarstvo"},
|
||||||
|
{ORTH: "tj.", NORM: "to je"},
|
||||||
|
{ORTH: "tož.", NORM: "tožilnik"},
|
||||||
|
{ORTH: "trg.", NORM: "trgovina"},
|
||||||
|
{ORTH: "ukr.", NORM: "ukrajinski"},
|
||||||
|
{ORTH: "um.", NORM: "umetnost"},
|
||||||
|
{ORTH: "vel.", NORM: "velelnik"},
|
||||||
|
{ORTH: "vet.", NORM: "veterina"},
|
||||||
|
{ORTH: "vez.", NORM: "veznik"},
|
||||||
|
{ORTH: "vn.", NORM: "visokonemško"},
|
||||||
|
{ORTH: "voj.", NORM: "vojska"},
|
||||||
|
{ORTH: "vrtn.", NORM: "vrtnarstvo"},
|
||||||
|
{ORTH: "vulg.", NORM: "vulgarno"},
|
||||||
|
{ORTH: "vznes.", NORM: "vzneseno"},
|
||||||
|
{ORTH: "zal.", NORM: "založništvo"},
|
||||||
|
{ORTH: "zastar.", NORM: "zastarelo"},
|
||||||
|
{ORTH: "zgod.", NORM: "zgodovina"},
|
||||||
|
{ORTH: "zool.", NORM: "zoologija"},
|
||||||
|
{ORTH: "čeb.", NORM: "čebelarstvo"},
|
||||||
|
{ORTH: "češ.", NORM: "češki"},
|
||||||
|
{ORTH: "člov.", NORM: "človeškost"},
|
||||||
|
{ORTH: "šah.", NORM: "šahovski"},
|
||||||
|
{ORTH: "šalj.", NORM: "šaljivo"},
|
||||||
|
{ORTH: "šp.", NORM: "španski"},
|
||||||
|
{ORTH: "špan.", NORM: "špansko"},
|
||||||
|
{ORTH: "šport.", NORM: "športni"},
|
||||||
|
{ORTH: "štev.", NORM: "števnik"},
|
||||||
|
{ORTH: "šved.", NORM: "švedsko"},
|
||||||
|
{ORTH: "švic.", NORM: "švicarsko"},
|
||||||
|
{ORTH: "ž.", NORM: "ženski"},
|
||||||
|
{ORTH: "žarg.", NORM: "žargonsko"},
|
||||||
|
{ORTH: "žel.", NORM: "železnica"},
|
||||||
|
{ORTH: "živ.", NORM: "živost"},
|
||||||
|
]:
|
||||||
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
|
||||||
|
abbrv = """
|
||||||
|
Co. Ch. DIPL. DR. Dr. Ev. Inc. Jr. Kr. Mag. M. MR. Mr. Mt. Murr. Npr. OZ.
|
||||||
|
Opr. Osn. Prim. Roj. ST. Sim. Sp. Sred. St. Sv. Škofl. Tel. UR. Zb.
|
||||||
|
a. aa. ab. abc. abit. abl. abs. abt. acc. accel. add. adj. adv. aet. afr. akad. al. alban. all. alleg.
|
||||||
|
alp. alt. alter. alžir. am. an. andr. ang. anh. anon. ans. antrop. apoc. app. approx. apt. ar. arc. arch.
|
||||||
|
arh. arr. as. asist. assist. assoc. asst. astr. attn. aug. avstral. az. b. bab. bal. bbl. bd. belg. bioinf.
|
||||||
|
biomed. bk. bl. bn. borg. bp. br. braz. brit. bros. broš. bt. bu. c. ca. cal. can. cand. cantab. cap. capt.
|
||||||
|
cat. cath. cc. cca. cd. cdr. cdre. cent. cerkv. cert. cf. cfr. ch. chap. chem. chr. chs. cic. circ. civ. cl.
|
||||||
|
cm. cmd. cnr. co. cod. col. coll. colo. com. comp. con. conc. cond. conn. cons. cont. coop. corr. cost. cp.
|
||||||
|
cpl. cr. crd. cres. cresc. ct. cu. d. dan. dat. davč. ddr. dec. ded. def. dem. dent. dept. dia. dip. dipl.
|
||||||
|
dir. disp. diss. div. do. doc. dok. dol. doo. dop. dott. dr. dram. druž. družb. drž. dt. duh. dur. dvr. dwt. e.
|
||||||
|
ea. ecc. eccl. eccles. econ. edn. egipt. egr. ekon. eksp. el. em. enc. eng. eo. ep. err. esp. esq. est.
|
||||||
|
et. etc. etnogr. etnol. ev. evfem. evr. ex. exc. excl. exp. expl. ext. exx. f. fa. facs. fak. faks. fas.
|
||||||
|
fasc. fco. fcp. feb. febr. fec. fed. fem. ff. fff. fid. fig. fil. film. fiziol. fiziot. flam. fm. fo. fol. folk.
|
||||||
|
frag. fran. franc. fsc. g. ga. gal. gdč. ge. gen. geod. geog. geotehnol. gg. gimn. glas. glav. gnr. go. gor.
|
||||||
|
gosp. gp. graf. gram. gren. grš. gs. h. hab. hf. hist. ho. hort. i. ia. ib. ibid. id. idr. idridr. ill. imen.
|
||||||
|
imp. impf. impr. in. inc. incl. ind. indus. inf. inform. ing. init. ins. int. inv. inšp. inštr. inž. is. islam.
|
||||||
|
ist. ital. iur. iz. izbr. izd. izg. izgr. izr. izv. j. jak. jam. jan. jav. je. jez. jr. jsl. jud. jug.
|
||||||
|
jugoslovan. jur. juž. jv. jz. k. kal. kan. kand. kat. kdo. kem. kip. kmet. kol. kom. komp. konf. kont. kost. kov.
|
||||||
|
kp. kpfw. kr. kraj. krat. kub. kult. kv. kval. l. la. lab. lb. ld. let. lib. lik. litt. lj. ljud. ll. loc. log.
|
||||||
|
loč. lt. ma. madž. mag. manag. manjš. masc. mass. mater. max. maxmax. mb. md. mech. medic. medij. medn.
|
||||||
|
mehč. mem. menedž. mes. mess. metal. meteor. meteorol. mex. mi. mikr. mil. minn. mio. misc. miss. mit. mk.
|
||||||
|
mkt. ml. mlad. mlle. mlr. mm. mme. množ. mo. moj. moš. možn. mr. mrd. mrs. ms. msc. msgr. mt. murr. mus. mut.
|
||||||
|
n. na. nad. nadalj. nadom. nagl. nakl. namer. nan. naniz. nasl. nat. navt. nač. ned. nem. nik. nizoz. nm. nn.
|
||||||
|
no. nom. norv. notr. nov. novogr. ns. o. ob. obd. obj. oblač. obl. oblik. obr. obraz. obs. obst. obt. obč. oc.
|
||||||
|
oct. od. odd. odg. odn. odst. odv. oec. off. ok. okla. okr. ont. oo. op. opis. opp. opr. orch. ord. ore. oreg.
|
||||||
|
org. orient. orig. ork. ort. oseb. osn. ot. ozir. ošk. p. pag. par. para. parc. parl. part. past. pat. pdk.
|
||||||
|
pen. perf. pert. perz. pesn. pet. pev. pf. pfc. ph. pharm. phil. pis. pl. po. pod. podr. podaljš. pogl. pogoj. pojm.
|
||||||
|
pok. pokr. pol. poljed. poljub. polu. pom. pomen. pon. ponov. pop. por. port. pos. posl. posn. pov. pp. ppl. pr.
|
||||||
|
praet. prav. pravopis. pravosl. preb. pred. predl. predm. predp. preds. pref. pregib. prel. prem. premen. prep.
|
||||||
|
pres. pret. prev. pribl. prih. pril. primerj. primor. prip. pripor. prir. prist. priv. proc. prof. prog. proiz.
|
||||||
|
prom. pron. prop. prot. protest. prov. ps. pss. pt. publ. pz. q. qld. qu. quad. que. r. racc. rastl. razgl.
|
||||||
|
razl. razv. rd. red. ref. reg. rel. relig. rep. repr. rer. resp. rest. ret. rev. revol. rež. rim. rist. rkp. rm.
|
||||||
|
roj. rom. romun. rp. rr. rt. rud. ruš. ry. sal. samogl. san. sc. scen. sci. scr. sdv. seg. sek. sen. sept. ser.
|
||||||
|
sev. sg. sgt. sh. sig. sigg. sign. sim. sin. sing. sinh. skand. skl. sklad. sklanj. sklep. skr. sl. slik. slov.
|
||||||
|
slovak. slovn. sn. so. sob. soc. sociol. sod. sopomen. sopr. sor. sov. sovj. sp. spec. spl. spr. spreg. sq. sr.
|
||||||
|
sre. sred. sredoz. srh. ss. ssp. st. sta. stan. stanstar. stcsl. ste. stim. stol. stom. str. stroj. strok. stsl.
|
||||||
|
stud. sup. supl. suppl. svet. sz. t. tab. tech. ted. tehn. tehnol. tek. teks. tekst. tel. temp. ten. teol. ter.
|
||||||
|
term. test. th. theol. tim. tip. tisočl. tit. tl. tol. tolmač. tom. tor. tov. tr. trad. traj. trans. tren.
|
||||||
|
trib. tril. trop. trp. trž. ts. tt. tu. tur. turiz. tvor. tvorb. tč. u. ul. umet. un. univ. up. upr. ur. urad.
|
||||||
|
us. ust. utr. v. va. val. var. varn. ven. ver. verb. vest. vezal. vic. vis. viv. viz. viš. vod. vok. vol. vpr.
|
||||||
|
vrst. vrstil. vs. vv. vzd. vzg. vzh. vzor. w. wed. wg. wk. x. y. z. zah. zaim. zak. zap. zasl. zavar. zač. zb.
|
||||||
|
združ. zg. zn. znan. znanstv. zoot. zun. zv. zvd. á. é. ć. č. čas. čet. čl. člen. čustv. đ. ľ. ł. ş. ŠT. š. šir.
|
||||||
|
škofl. škot. šol. št. števil. štud. ů. ű. žen. žival.
|
||||||
|
""".split()
|
||||||
|
|
||||||
|
for orth in abbrv:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
|
@ -17,10 +17,6 @@ URL_PATTERN = (
|
||||||
r"(?:\S+(?::\S*)?@)?"
|
r"(?:\S+(?::\S*)?@)?"
|
||||||
r"(?:"
|
r"(?:"
|
||||||
# IP address exclusion
|
# IP address exclusion
|
||||||
# private & local networks
|
|
||||||
r"(?!(?:10|127)(?:\.\d{1,3}){3})"
|
|
||||||
r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
|
|
||||||
r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
|
|
||||||
# IP address dotted notation octets
|
# IP address dotted notation octets
|
||||||
# excludes loopback network 0.0.0.0
|
# excludes loopback network 0.0.0.0
|
||||||
# excludes reserved space >= 224.0.0.0
|
# excludes reserved space >= 224.0.0.0
|
||||||
|
|
|
@ -29,7 +29,7 @@ class Ukrainian(Language):
|
||||||
assigns=["token.lemma"],
|
assigns=["token.lemma"],
|
||||||
default_config={
|
default_config={
|
||||||
"model": None,
|
"model": None,
|
||||||
"mode": "pymorphy2",
|
"mode": "pymorphy3",
|
||||||
"overwrite": False,
|
"overwrite": False,
|
||||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
},
|
},
|
||||||
|
|
|
@ -14,7 +14,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
|
||||||
model: Optional[Model],
|
model: Optional[Model],
|
||||||
name: str = "lemmatizer",
|
name: str = "lemmatizer",
|
||||||
*,
|
*,
|
||||||
mode: str = "pymorphy2",
|
mode: str = "pymorphy3",
|
||||||
overwrite: bool = False,
|
overwrite: bool = False,
|
||||||
scorer: Optional[Callable] = lemmatizer_score,
|
scorer: Optional[Callable] = lemmatizer_score,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
@ -29,6 +29,17 @@ class UkrainianLemmatizer(RussianLemmatizer):
|
||||||
) from None
|
) from None
|
||||||
if getattr(self, "_morph", None) is None:
|
if getattr(self, "_morph", None) is None:
|
||||||
self._morph = MorphAnalyzer(lang="uk")
|
self._morph = MorphAnalyzer(lang="uk")
|
||||||
|
elif mode == "pymorphy3":
|
||||||
|
try:
|
||||||
|
from pymorphy3 import MorphAnalyzer
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"The Ukrainian lemmatizer mode 'pymorphy3' requires the "
|
||||||
|
"pymorphy3 library and dictionaries. Install them with: "
|
||||||
|
"pip install pymorphy3 pymorphy3-dicts-uk"
|
||||||
|
) from None
|
||||||
|
if getattr(self, "_morph", None) is None:
|
||||||
|
self._morph = MorphAnalyzer(lang="uk")
|
||||||
super().__init__(
|
super().__init__(
|
||||||
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
)
|
)
|
||||||
|
|
|
@ -465,6 +465,8 @@ class Language:
|
||||||
"""
|
"""
|
||||||
if not isinstance(name, str):
|
if not isinstance(name, str):
|
||||||
raise ValueError(Errors.E963.format(decorator="factory"))
|
raise ValueError(Errors.E963.format(decorator="factory"))
|
||||||
|
if "." in name:
|
||||||
|
raise ValueError(Errors.E853.format(name=name))
|
||||||
if not isinstance(default_config, dict):
|
if not isinstance(default_config, dict):
|
||||||
err = Errors.E962.format(
|
err = Errors.E962.format(
|
||||||
style="default config", name=name, cfg_type=type(default_config)
|
style="default config", name=name, cfg_type=type(default_config)
|
||||||
|
@ -543,8 +545,11 @@ class Language:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#component
|
DOCS: https://spacy.io/api/language#component
|
||||||
"""
|
"""
|
||||||
if name is not None and not isinstance(name, str):
|
if name is not None:
|
||||||
raise ValueError(Errors.E963.format(decorator="component"))
|
if not isinstance(name, str):
|
||||||
|
raise ValueError(Errors.E963.format(decorator="component"))
|
||||||
|
if "." in name:
|
||||||
|
raise ValueError(Errors.E853.format(name=name))
|
||||||
component_name = name if name is not None else util.get_object_name(func)
|
component_name = name if name is not None else util.get_object_name(func)
|
||||||
|
|
||||||
def add_component(component_func: "Pipe") -> Callable:
|
def add_component(component_func: "Pipe") -> Callable:
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .attributeruler import AttributeRuler
|
from .attribute_ruler import AttributeRuler
|
||||||
from .dep_parser import DependencyParser
|
from .dep_parser import DependencyParser
|
||||||
from .edit_tree_lemmatizer import EditTreeLemmatizer
|
from .edit_tree_lemmatizer import EditTreeLemmatizer
|
||||||
from .entity_linker import EntityLinker
|
from .entity_linker import EntityLinker
|
||||||
from .ner import EntityRecognizer
|
from .ner import EntityRecognizer
|
||||||
from .entityruler import EntityRuler
|
from .entity_ruler import EntityRuler
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
from .morphologizer import Morphologizer
|
from .morphologizer import Morphologizer
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
|
|
|
@ -207,7 +207,7 @@ class TokenPatternOperatorSimple(str, Enum):
|
||||||
|
|
||||||
|
|
||||||
class TokenPatternOperatorMinMax(ConstrainedStr):
|
class TokenPatternOperatorMinMax(ConstrainedStr):
|
||||||
regex = re.compile("^({\d+}|{\d+,\d*}|{\d*,\d+})$")
|
regex = re.compile(r"^({\d+}|{\d+,\d*}|{\d*,\d+})$")
|
||||||
|
|
||||||
|
|
||||||
TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax]
|
TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax]
|
||||||
|
@ -514,6 +514,14 @@ class DocJSONSchema(BaseModel):
|
||||||
tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
|
tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
|
||||||
..., title="Token information - ID, start, annotations"
|
..., title="Token information - ID, start, annotations"
|
||||||
)
|
)
|
||||||
_: Optional[Dict[StrictStr, Any]] = Field(
|
underscore_doc: Optional[Dict[StrictStr, Any]] = Field(
|
||||||
None, title="Any custom data stored in the document's _ attribute"
|
None,
|
||||||
|
title="Any custom data stored in the document's _ attribute",
|
||||||
|
alias="_",
|
||||||
|
)
|
||||||
|
underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
|
||||||
|
None, title="Any custom data stored in the token's _ attribute"
|
||||||
|
)
|
||||||
|
underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
|
||||||
|
None, title="Any custom data stored in the span's _ attribute"
|
||||||
)
|
)
|
||||||
|
|
|
@ -239,7 +239,7 @@ def hsb_tokenizer():
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def ko_tokenizer():
|
def ko_tokenizer():
|
||||||
pytest.importorskip("natto")
|
pytest.importorskip("mecab_ko")
|
||||||
return get_lang_class("ko")().tokenizer
|
return get_lang_class("ko")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
@ -256,11 +256,30 @@ def ko_tokenizer_tokenizer():
|
||||||
return nlp.tokenizer
|
return nlp.tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def ko_tokenizer_natto():
|
||||||
|
pytest.importorskip("natto")
|
||||||
|
config = {
|
||||||
|
"nlp": {
|
||||||
|
"tokenizer": {
|
||||||
|
"@tokenizers": "spacy.KoreanNattoTokenizer.v1",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nlp = get_lang_class("ko").from_config(config)
|
||||||
|
return nlp.tokenizer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def lb_tokenizer():
|
def lb_tokenizer():
|
||||||
return get_lang_class("lb")().tokenizer
|
return get_lang_class("lb")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def lg_tokenizer():
|
||||||
|
return get_lang_class("lg")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def lt_tokenizer():
|
def lt_tokenizer():
|
||||||
return get_lang_class("lt")().tokenizer
|
return get_lang_class("lt")().tokenizer
|
||||||
|
@ -323,13 +342,13 @@ def ro_tokenizer():
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def ru_tokenizer():
|
def ru_tokenizer():
|
||||||
pytest.importorskip("pymorphy2")
|
pytest.importorskip("pymorphy3")
|
||||||
return get_lang_class("ru")().tokenizer
|
return get_lang_class("ru")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def ru_lemmatizer():
|
def ru_lemmatizer():
|
||||||
pytest.importorskip("pymorphy2")
|
pytest.importorskip("pymorphy3")
|
||||||
return get_lang_class("ru")().add_pipe("lemmatizer")
|
return get_lang_class("ru")().add_pipe("lemmatizer")
|
||||||
|
|
||||||
|
|
||||||
|
@ -401,14 +420,14 @@ def ky_tokenizer():
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def uk_tokenizer():
|
def uk_tokenizer():
|
||||||
pytest.importorskip("pymorphy2")
|
pytest.importorskip("pymorphy3")
|
||||||
return get_lang_class("uk")().tokenizer
|
return get_lang_class("uk")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def uk_lemmatizer():
|
def uk_lemmatizer():
|
||||||
pytest.importorskip("pymorphy2")
|
pytest.importorskip("pymorphy3")
|
||||||
pytest.importorskip("pymorphy2_dicts_uk")
|
pytest.importorskip("pymorphy3_dicts_uk")
|
||||||
return get_lang_class("uk")().add_pipe("lemmatizer")
|
return get_lang_class("uk")().add_pipe("lemmatizer")
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -45,6 +45,33 @@ def test_ents_reset(en_vocab):
|
||||||
assert [t.ent_iob_ for t in doc] == orig_iobs
|
assert [t.ent_iob_ for t in doc] == orig_iobs
|
||||||
|
|
||||||
|
|
||||||
|
def test_ents_clear(en_vocab):
|
||||||
|
"""Ensure that removing entities clears token attributes"""
|
||||||
|
text = ["Louisiana", "Office", "of", "Conservation"]
|
||||||
|
doc = Doc(en_vocab, words=text)
|
||||||
|
entity = Span(doc, 0, 4, label=391, span_id="TEST")
|
||||||
|
doc.ents = [entity]
|
||||||
|
doc.ents = []
|
||||||
|
for token in doc:
|
||||||
|
assert token.ent_iob == 2
|
||||||
|
assert token.ent_type == 0
|
||||||
|
assert token.ent_id == 0
|
||||||
|
assert token.ent_kb_id == 0
|
||||||
|
doc.ents = [entity]
|
||||||
|
doc.set_ents([], default="missing")
|
||||||
|
for token in doc:
|
||||||
|
assert token.ent_iob == 0
|
||||||
|
assert token.ent_type == 0
|
||||||
|
assert token.ent_id == 0
|
||||||
|
assert token.ent_kb_id == 0
|
||||||
|
doc.set_ents([], default="blocked")
|
||||||
|
for token in doc:
|
||||||
|
assert token.ent_iob == 3
|
||||||
|
assert token.ent_type == 0
|
||||||
|
assert token.ent_id == 0
|
||||||
|
assert token.ent_kb_id == 0
|
||||||
|
|
||||||
|
|
||||||
def test_add_overlapping_entities(en_vocab):
|
def test_add_overlapping_entities(en_vocab):
|
||||||
text = ["Louisiana", "Office", "of", "Conservation"]
|
text = ["Louisiana", "Office", "of", "Conservation"]
|
||||||
doc = Doc(en_vocab, words=text)
|
doc = Doc(en_vocab, words=text)
|
||||||
|
|
|
@ -3,6 +3,7 @@ import weakref
|
||||||
import numpy
|
import numpy
|
||||||
from numpy.testing import assert_array_equal
|
from numpy.testing import assert_array_equal
|
||||||
import pytest
|
import pytest
|
||||||
|
import warnings
|
||||||
from thinc.api import NumpyOps, get_current_ops
|
from thinc.api import NumpyOps, get_current_ops
|
||||||
|
|
||||||
from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS
|
from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS
|
||||||
|
@ -529,9 +530,9 @@ def test_doc_from_array_sent_starts(en_vocab):
|
||||||
# no warning using default attrs
|
# no warning using default attrs
|
||||||
attrs = doc._get_array_attrs()
|
attrs = doc._get_array_attrs()
|
||||||
arr = doc.to_array(attrs)
|
arr = doc.to_array(attrs)
|
||||||
with pytest.warns(None) as record:
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("error")
|
||||||
new_doc.from_array(attrs, arr)
|
new_doc.from_array(attrs, arr)
|
||||||
assert len(record) == 0
|
|
||||||
# only SENT_START uses SENT_START
|
# only SENT_START uses SENT_START
|
||||||
attrs = [SENT_START]
|
attrs = [SENT_START]
|
||||||
arr = doc.to_array(attrs)
|
arr = doc.to_array(attrs)
|
||||||
|
|
|
@ -1,12 +1,15 @@
|
||||||
import pytest
|
import pytest
|
||||||
import spacy
|
import spacy
|
||||||
from spacy import schemas
|
from spacy import schemas
|
||||||
from spacy.tokens import Doc, Span
|
from spacy.tokens import Doc, Span, Token
|
||||||
|
import srsly
|
||||||
|
from .test_underscore import clean_underscore # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture()
|
||||||
def doc(en_vocab):
|
def doc(en_vocab):
|
||||||
words = ["c", "d", "e"]
|
words = ["c", "d", "e"]
|
||||||
|
spaces = [True, True, True]
|
||||||
pos = ["VERB", "NOUN", "NOUN"]
|
pos = ["VERB", "NOUN", "NOUN"]
|
||||||
tags = ["VBP", "NN", "NN"]
|
tags = ["VBP", "NN", "NN"]
|
||||||
heads = [0, 0, 1]
|
heads = [0, 0, 1]
|
||||||
|
@ -17,6 +20,7 @@ def doc(en_vocab):
|
||||||
return Doc(
|
return Doc(
|
||||||
en_vocab,
|
en_vocab,
|
||||||
words=words,
|
words=words,
|
||||||
|
spaces=spaces,
|
||||||
pos=pos,
|
pos=pos,
|
||||||
tags=tags,
|
tags=tags,
|
||||||
heads=heads,
|
heads=heads,
|
||||||
|
@ -45,6 +49,47 @@ def doc_without_deps(en_vocab):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def doc_json():
|
||||||
|
return {
|
||||||
|
"text": "c d e ",
|
||||||
|
"ents": [{"start": 2, "end": 3, "label": "ORG"}],
|
||||||
|
"sents": [{"start": 0, "end": 5}],
|
||||||
|
"tokens": [
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"start": 0,
|
||||||
|
"end": 1,
|
||||||
|
"tag": "VBP",
|
||||||
|
"pos": "VERB",
|
||||||
|
"morph": "Feat1=A",
|
||||||
|
"dep": "ROOT",
|
||||||
|
"head": 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"start": 2,
|
||||||
|
"end": 3,
|
||||||
|
"tag": "NN",
|
||||||
|
"pos": "NOUN",
|
||||||
|
"morph": "Feat1=B",
|
||||||
|
"dep": "dobj",
|
||||||
|
"head": 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"start": 4,
|
||||||
|
"end": 5,
|
||||||
|
"tag": "NN",
|
||||||
|
"pos": "NOUN",
|
||||||
|
"morph": "Feat1=A|Feat2=D",
|
||||||
|
"dep": "dobj",
|
||||||
|
"head": 1,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def test_doc_to_json(doc):
|
def test_doc_to_json(doc):
|
||||||
json_doc = doc.to_json()
|
json_doc = doc.to_json()
|
||||||
assert json_doc["text"] == "c d e "
|
assert json_doc["text"] == "c d e "
|
||||||
|
@ -56,7 +101,8 @@ def test_doc_to_json(doc):
|
||||||
assert json_doc["ents"][0]["start"] == 2 # character offset!
|
assert json_doc["ents"][0]["start"] == 2 # character offset!
|
||||||
assert json_doc["ents"][0]["end"] == 3 # character offset!
|
assert json_doc["ents"][0]["end"] == 3 # character offset!
|
||||||
assert json_doc["ents"][0]["label"] == "ORG"
|
assert json_doc["ents"][0]["label"] == "ORG"
|
||||||
assert not schemas.validate(schemas.DocJSONSchema, json_doc)
|
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||||
|
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
|
||||||
|
|
||||||
|
|
||||||
def test_doc_to_json_underscore(doc):
|
def test_doc_to_json_underscore(doc):
|
||||||
|
@ -64,11 +110,96 @@ def test_doc_to_json_underscore(doc):
|
||||||
Doc.set_extension("json_test2", default=False)
|
Doc.set_extension("json_test2", default=False)
|
||||||
doc._.json_test1 = "hello world"
|
doc._.json_test1 = "hello world"
|
||||||
doc._.json_test2 = [1, 2, 3]
|
doc._.json_test2 = [1, 2, 3]
|
||||||
|
|
||||||
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
|
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
|
||||||
assert "_" in json_doc
|
assert "_" in json_doc
|
||||||
assert json_doc["_"]["json_test1"] == "hello world"
|
assert json_doc["_"]["json_test1"] == "hello world"
|
||||||
assert json_doc["_"]["json_test2"] == [1, 2, 3]
|
assert json_doc["_"]["json_test2"] == [1, 2, 3]
|
||||||
assert not schemas.validate(schemas.DocJSONSchema, json_doc)
|
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||||
|
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_to_json_with_token_span_attributes(doc):
|
||||||
|
Doc.set_extension("json_test1", default=False)
|
||||||
|
Doc.set_extension("json_test2", default=False)
|
||||||
|
Token.set_extension("token_test", default=False)
|
||||||
|
Span.set_extension("span_test", default=False)
|
||||||
|
|
||||||
|
doc._.json_test1 = "hello world"
|
||||||
|
doc._.json_test2 = [1, 2, 3]
|
||||||
|
doc[0:1]._.span_test = "span_attribute"
|
||||||
|
doc[0]._.token_test = 117
|
||||||
|
doc.spans["span_group"] = [doc[0:1]]
|
||||||
|
json_doc = doc.to_json(
|
||||||
|
underscore=["json_test1", "json_test2", "token_test", "span_test"]
|
||||||
|
)
|
||||||
|
|
||||||
|
assert "_" in json_doc
|
||||||
|
assert json_doc["_"]["json_test1"] == "hello world"
|
||||||
|
assert json_doc["_"]["json_test2"] == [1, 2, 3]
|
||||||
|
assert "underscore_token" in json_doc
|
||||||
|
assert "underscore_span" in json_doc
|
||||||
|
assert json_doc["underscore_token"]["token_test"]["value"] == 117
|
||||||
|
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
|
||||||
|
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||||
|
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_to_json_with_custom_user_data(doc):
|
||||||
|
Doc.set_extension("json_test", default=False)
|
||||||
|
Token.set_extension("token_test", default=False)
|
||||||
|
Span.set_extension("span_test", default=False)
|
||||||
|
|
||||||
|
doc._.json_test = "hello world"
|
||||||
|
doc[0:1]._.span_test = "span_attribute"
|
||||||
|
doc[0]._.token_test = 117
|
||||||
|
json_doc = doc.to_json(underscore=["json_test", "token_test", "span_test"])
|
||||||
|
doc.user_data["user_data_test"] = 10
|
||||||
|
doc.user_data[("user_data_test2", True)] = 10
|
||||||
|
|
||||||
|
assert "_" in json_doc
|
||||||
|
assert json_doc["_"]["json_test"] == "hello world"
|
||||||
|
assert "underscore_token" in json_doc
|
||||||
|
assert "underscore_span" in json_doc
|
||||||
|
assert json_doc["underscore_token"]["token_test"]["value"] == 117
|
||||||
|
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
|
||||||
|
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||||
|
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_to_json_with_token_span_same_identifier(doc):
|
||||||
|
Doc.set_extension("my_ext", default=False)
|
||||||
|
Token.set_extension("my_ext", default=False)
|
||||||
|
Span.set_extension("my_ext", default=False)
|
||||||
|
|
||||||
|
doc._.my_ext = "hello world"
|
||||||
|
doc[0:1]._.my_ext = "span_attribute"
|
||||||
|
doc[0]._.my_ext = 117
|
||||||
|
json_doc = doc.to_json(underscore=["my_ext"])
|
||||||
|
|
||||||
|
assert "_" in json_doc
|
||||||
|
assert json_doc["_"]["my_ext"] == "hello world"
|
||||||
|
assert "underscore_token" in json_doc
|
||||||
|
assert "underscore_span" in json_doc
|
||||||
|
assert json_doc["underscore_token"]["my_ext"]["value"] == 117
|
||||||
|
assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute"
|
||||||
|
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||||
|
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_to_json_with_token_attributes_missing(doc):
|
||||||
|
Token.set_extension("token_test", default=False)
|
||||||
|
Span.set_extension("span_test", default=False)
|
||||||
|
|
||||||
|
doc[0:1]._.span_test = "span_attribute"
|
||||||
|
doc[0]._.token_test = 117
|
||||||
|
json_doc = doc.to_json(underscore=["span_test"])
|
||||||
|
|
||||||
|
assert "underscore_token" in json_doc
|
||||||
|
assert "underscore_span" in json_doc
|
||||||
|
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
|
||||||
|
assert "token_test" not in json_doc["underscore_token"]
|
||||||
|
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_doc_to_json_underscore_error_attr(doc):
|
def test_doc_to_json_underscore_error_attr(doc):
|
||||||
|
@ -94,11 +225,29 @@ def test_doc_to_json_span(doc):
|
||||||
assert len(json_doc["spans"]) == 1
|
assert len(json_doc["spans"]) == 1
|
||||||
assert len(json_doc["spans"]["test"]) == 2
|
assert len(json_doc["spans"]["test"]) == 2
|
||||||
assert json_doc["spans"]["test"][0]["start"] == 0
|
assert json_doc["spans"]["test"][0]["start"] == 0
|
||||||
assert not schemas.validate(schemas.DocJSONSchema, json_doc)
|
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_json_to_doc(doc):
|
def test_json_to_doc(doc):
|
||||||
new_doc = Doc(doc.vocab).from_json(doc.to_json(), validate=True)
|
json_doc = doc.to_json()
|
||||||
|
json_doc = srsly.json_loads(srsly.json_dumps(json_doc))
|
||||||
|
new_doc = Doc(doc.vocab).from_json(json_doc, validate=True)
|
||||||
|
assert new_doc.text == doc.text == "c d e "
|
||||||
|
assert len(new_doc) == len(doc) == 3
|
||||||
|
assert new_doc[0].pos == doc[0].pos
|
||||||
|
assert new_doc[0].tag == doc[0].tag
|
||||||
|
assert new_doc[0].dep == doc[0].dep
|
||||||
|
assert new_doc[0].head.idx == doc[0].head.idx
|
||||||
|
assert new_doc[0].lemma == doc[0].lemma
|
||||||
|
assert len(new_doc.ents) == 1
|
||||||
|
assert new_doc.ents[0].start == 1
|
||||||
|
assert new_doc.ents[0].end == 2
|
||||||
|
assert new_doc.ents[0].label_ == "ORG"
|
||||||
|
assert doc.to_bytes() == new_doc.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
def test_json_to_doc_compat(doc, doc_json):
|
||||||
|
new_doc = Doc(doc.vocab).from_json(doc_json, validate=True)
|
||||||
new_tokens = [token for token in new_doc]
|
new_tokens = [token for token in new_doc]
|
||||||
assert new_doc.text == doc.text == "c d e "
|
assert new_doc.text == doc.text == "c d e "
|
||||||
assert len(new_tokens) == len([token for token in doc]) == 3
|
assert len(new_tokens) == len([token for token in doc]) == 3
|
||||||
|
@ -114,11 +263,8 @@ def test_json_to_doc(doc):
|
||||||
|
|
||||||
|
|
||||||
def test_json_to_doc_underscore(doc):
|
def test_json_to_doc_underscore(doc):
|
||||||
if not Doc.has_extension("json_test1"):
|
Doc.set_extension("json_test1", default=False)
|
||||||
Doc.set_extension("json_test1", default=False)
|
Doc.set_extension("json_test2", default=False)
|
||||||
if not Doc.has_extension("json_test2"):
|
|
||||||
Doc.set_extension("json_test2", default=False)
|
|
||||||
|
|
||||||
doc._.json_test1 = "hello world"
|
doc._.json_test1 = "hello world"
|
||||||
doc._.json_test2 = [1, 2, 3]
|
doc._.json_test2 = [1, 2, 3]
|
||||||
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
|
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
|
||||||
|
@ -126,6 +272,34 @@ def test_json_to_doc_underscore(doc):
|
||||||
assert all([new_doc.has_extension(f"json_test{i}") for i in range(1, 3)])
|
assert all([new_doc.has_extension(f"json_test{i}") for i in range(1, 3)])
|
||||||
assert new_doc._.json_test1 == "hello world"
|
assert new_doc._.json_test1 == "hello world"
|
||||||
assert new_doc._.json_test2 == [1, 2, 3]
|
assert new_doc._.json_test2 == [1, 2, 3]
|
||||||
|
assert doc.to_bytes() == new_doc.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
def test_json_to_doc_with_token_span_attributes(doc):
|
||||||
|
Doc.set_extension("json_test1", default=False)
|
||||||
|
Doc.set_extension("json_test2", default=False)
|
||||||
|
Token.set_extension("token_test", default=False)
|
||||||
|
Span.set_extension("span_test", default=False)
|
||||||
|
doc._.json_test1 = "hello world"
|
||||||
|
doc._.json_test2 = [1, 2, 3]
|
||||||
|
doc[0:1]._.span_test = "span_attribute"
|
||||||
|
doc[0]._.token_test = 117
|
||||||
|
|
||||||
|
json_doc = doc.to_json(
|
||||||
|
underscore=["json_test1", "json_test2", "token_test", "span_test"]
|
||||||
|
)
|
||||||
|
json_doc = srsly.json_loads(srsly.json_dumps(json_doc))
|
||||||
|
new_doc = Doc(doc.vocab).from_json(json_doc, validate=True)
|
||||||
|
|
||||||
|
assert all([new_doc.has_extension(f"json_test{i}") for i in range(1, 3)])
|
||||||
|
assert new_doc._.json_test1 == "hello world"
|
||||||
|
assert new_doc._.json_test2 == [1, 2, 3]
|
||||||
|
assert new_doc[0]._.token_test == 117
|
||||||
|
assert new_doc[0:1]._.span_test == "span_attribute"
|
||||||
|
assert new_doc.user_data == doc.user_data
|
||||||
|
assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes(
|
||||||
|
exclude=["user_data"]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_json_to_doc_spans(doc):
|
def test_json_to_doc_spans(doc):
|
||||||
|
|
|
@ -692,3 +692,23 @@ def test_span_group_copy(doc):
|
||||||
assert len(doc.spans["test"]) == 3
|
assert len(doc.spans["test"]) == 3
|
||||||
# check that the copy spans were not modified and this is an isolated doc
|
# check that the copy spans were not modified and this is an isolated doc
|
||||||
assert len(doc_copy.spans["test"]) == 2
|
assert len(doc_copy.spans["test"]) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(11113)
|
||||||
|
def test_span_ent_id(en_tokenizer):
|
||||||
|
doc = en_tokenizer("a b c d")
|
||||||
|
doc.ents = [Span(doc, 1, 3, label="A", span_id="ID0")]
|
||||||
|
span = doc.ents[0]
|
||||||
|
assert doc[1].ent_id_ == "ID0"
|
||||||
|
|
||||||
|
# setting Span.id sets Token.ent_id
|
||||||
|
span.id_ = "ID1"
|
||||||
|
doc.ents = [span]
|
||||||
|
assert doc.ents[0].ent_id_ == "ID1"
|
||||||
|
assert doc[1].ent_id_ == "ID1"
|
||||||
|
|
||||||
|
# Span.ent_id is an alias of Span.id
|
||||||
|
span.ent_id_ = "ID2"
|
||||||
|
doc.ents = [span]
|
||||||
|
assert doc.ents[0].ent_id_ == "ID2"
|
||||||
|
assert doc[1].ent_id_ == "ID2"
|
||||||
|
|
|
@ -7,3 +7,11 @@ import pytest
|
||||||
def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
|
def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
|
||||||
test_lemma = ko_tokenizer(word)[0].lemma_
|
test_lemma = ko_tokenizer(word)[0].lemma_
|
||||||
assert test_lemma == lemma
|
assert test_lemma == lemma
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
|
||||||
|
)
|
||||||
|
def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
|
||||||
|
test_lemma = ko_tokenizer_natto(word)[0].lemma_
|
||||||
|
assert test_lemma == lemma
|
||||||
|
|
|
@ -22,3 +22,23 @@ def test_ko_tokenizer_pickle(ko_tokenizer):
|
||||||
b = pickle.dumps(ko_tokenizer)
|
b = pickle.dumps(ko_tokenizer)
|
||||||
ko_tokenizer_re = pickle.loads(b)
|
ko_tokenizer_re = pickle.loads(b)
|
||||||
assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
|
assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
|
||||||
|
tokenizer_bytes = ko_tokenizer_natto.to_bytes()
|
||||||
|
nlp = Korean()
|
||||||
|
nlp.tokenizer.from_bytes(tokenizer_bytes)
|
||||||
|
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
|
||||||
|
|
||||||
|
with make_tempdir() as d:
|
||||||
|
file_path = d / "tokenizer"
|
||||||
|
ko_tokenizer_natto.to_disk(file_path)
|
||||||
|
nlp = Korean()
|
||||||
|
nlp.tokenizer.from_disk(file_path)
|
||||||
|
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
|
||||||
|
b = pickle.dumps(ko_tokenizer_natto)
|
||||||
|
ko_tokenizer_natto_re = pickle.loads(b)
|
||||||
|
assert ko_tokenizer_natto.to_bytes() == ko_tokenizer_natto_re.to_bytes()
|
||||||
|
|
|
@ -19,6 +19,8 @@ POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
|
||||||
"PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
|
"PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
# tests for ko_tokenizer (default KoreanTokenizer)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
|
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
|
||||||
def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):
|
def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):
|
||||||
|
@ -44,7 +46,7 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
|
||||||
assert pos == expected_pos.split()
|
assert pos == expected_pos.split()
|
||||||
|
|
||||||
|
|
||||||
def test_ko_empty_doc(ko_tokenizer):
|
def test_ko_tokenizer_empty_doc(ko_tokenizer):
|
||||||
tokens = ko_tokenizer("")
|
tokens = ko_tokenizer("")
|
||||||
assert len(tokens) == 0
|
assert len(tokens) == 0
|
||||||
|
|
||||||
|
@ -55,6 +57,44 @@ def test_ko_tokenizer_unknown_tag(ko_tokenizer):
|
||||||
assert tokens[1].pos_ == "X"
|
assert tokens[1].pos_ == "X"
|
||||||
|
|
||||||
|
|
||||||
|
# same tests for ko_tokenizer_natto (KoreanNattoTokenizer)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
|
||||||
|
def test_ko_tokenizer_natto(ko_tokenizer_natto, text, expected_tokens):
|
||||||
|
tokens = [token.text for token in ko_tokenizer_natto(text)]
|
||||||
|
assert tokens == expected_tokens.split()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
|
||||||
|
def test_ko_tokenizer_natto_tags(ko_tokenizer_natto, text, expected_tags):
|
||||||
|
tags = [token.tag_ for token in ko_tokenizer_natto(text)]
|
||||||
|
assert tags == expected_tags.split()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
|
||||||
|
def test_ko_tokenizer_natto_full_tags(ko_tokenizer_natto, text, expected_tags):
|
||||||
|
tags = ko_tokenizer_natto(text).user_data["full_tags"]
|
||||||
|
assert tags == expected_tags.split()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
|
||||||
|
def test_ko_tokenizer_natto_pos(ko_tokenizer_natto, text, expected_pos):
|
||||||
|
pos = [token.pos_ for token in ko_tokenizer_natto(text)]
|
||||||
|
assert pos == expected_pos.split()
|
||||||
|
|
||||||
|
|
||||||
|
def test_ko_tokenizer_natto_empty_doc(ko_tokenizer_natto):
|
||||||
|
tokens = ko_tokenizer_natto("")
|
||||||
|
assert len(tokens) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(10535)
|
||||||
|
def test_ko_tokenizer_natto_unknown_tag(ko_tokenizer_natto):
|
||||||
|
tokens = ko_tokenizer_natto("미닛 리피터")
|
||||||
|
assert tokens[1].pos_ == "X"
|
||||||
|
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
SPACY_TOKENIZER_TESTS = [
|
SPACY_TOKENIZER_TESTS = [
|
||||||
("있다.", "있다 ."),
|
("있다.", "있다 ."),
|
||||||
|
|
0
spacy/tests/lang/lg/__init__.py
Normal file
0
spacy/tests/lang/lg/__init__.py
Normal file
15
spacy/tests/lang/lg/test_tokenizer.py
Normal file
15
spacy/tests/lang/lg/test_tokenizer.py
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
LG_BASIC_TOKENIZATION_TESTS = [
|
||||||
|
(
|
||||||
|
"Abooluganda ab’emmamba ababiri",
|
||||||
|
["Abooluganda", "ab’emmamba", "ababiri"],
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text,expected_tokens", LG_BASIC_TOKENIZATION_TESTS)
|
||||||
|
def test_lg_tokenizer_basic(lg_tokenizer, text, expected_tokens):
|
||||||
|
tokens = lg_tokenizer(text)
|
||||||
|
token_list = [token.text for token in tokens if not token.is_space]
|
||||||
|
assert expected_tokens == token_list
|
|
@ -1,5 +1,6 @@
|
||||||
from spacy.tokens import Doc
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
from spacy.util import filter_spans
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -207,3 +208,18 @@ def test_chunking(nl_sample, nl_reference_chunking):
|
||||||
"""
|
"""
|
||||||
chunks = [s.text.lower() for s in nl_sample.noun_chunks]
|
chunks = [s.text.lower() for s in nl_sample.noun_chunks]
|
||||||
assert chunks == nl_reference_chunking
|
assert chunks == nl_reference_chunking
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(10846)
|
||||||
|
def test_no_overlapping_chunks(nl_vocab):
|
||||||
|
# fmt: off
|
||||||
|
doc = Doc(
|
||||||
|
nl_vocab,
|
||||||
|
words=["Dit", "programma", "wordt", "beschouwd", "als", "'s", "werelds", "eerste", "computerprogramma"],
|
||||||
|
deps=["det", "nsubj:pass", "aux:pass", "ROOT", "mark", "det", "fixed", "amod", "xcomp"],
|
||||||
|
heads=[1, 3, 3, 3, 8, 8, 5, 8, 3],
|
||||||
|
pos=["DET", "NOUN", "AUX", "VERB", "SCONJ", "DET", "NOUN", "ADJ", "NOUN"],
|
||||||
|
)
|
||||||
|
# fmt: on
|
||||||
|
chunks = list(doc.noun_chunks)
|
||||||
|
assert filter_spans(chunks) == chunks
|
||||||
|
|
|
@ -2,6 +2,9 @@ import pytest
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
|
||||||
|
|
||||||
|
|
||||||
def test_ru_doc_lemmatization(ru_lemmatizer):
|
def test_ru_doc_lemmatization(ru_lemmatizer):
|
||||||
words = ["мама", "мыла", "раму"]
|
words = ["мама", "мыла", "раму"]
|
||||||
pos = ["NOUN", "VERB", "NOUN"]
|
pos = ["NOUN", "VERB", "NOUN"]
|
||||||
|
|
|
@ -20,7 +20,6 @@ od katerih so te svoboščine odvisne,
|
||||||
assert len(tokens) == 116
|
assert len(tokens) == 116
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_ordinal_number(sl_tokenizer):
|
def test_ordinal_number(sl_tokenizer):
|
||||||
text = "10. decembra 1948"
|
text = "10. decembra 1948"
|
||||||
tokens = sl_tokenizer(text)
|
tokens = sl_tokenizer(text)
|
||||||
|
|
|
@ -26,14 +26,6 @@ def test_attrs_idempotence(text):
|
||||||
assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}
|
assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text", ["dog"])
|
|
||||||
def test_attrs_do_deprecated(text):
|
|
||||||
int_attrs = intify_attrs(
|
|
||||||
{"F": text, "is_alpha": True}, strings_map={text: 10}, _do_deprecated=True
|
|
||||||
)
|
|
||||||
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
|
|
||||||
|
|
||||||
|
|
||||||
def test_attrs_ent_iob_intify():
|
def test_attrs_ent_iob_intify():
|
||||||
int_attrs = intify_attrs({"ENT_IOB": ""})
|
int_attrs = intify_attrs({"ENT_IOB": ""})
|
||||||
assert int_attrs == {ENT_IOB: 0}
|
assert int_attrs == {ENT_IOB: 0}
|
||||||
|
|
|
@ -1,6 +1,10 @@
|
||||||
|
import pytest
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
|
||||||
|
|
||||||
|
|
||||||
def test_uk_lemmatizer(uk_lemmatizer):
|
def test_uk_lemmatizer(uk_lemmatizer):
|
||||||
"""Check that the default uk lemmatizer runs."""
|
"""Check that the default uk lemmatizer runs."""
|
||||||
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
|
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
import warnings
|
||||||
import srsly
|
import srsly
|
||||||
from mock import Mock
|
from mock import Mock
|
||||||
|
|
||||||
|
@ -344,13 +345,13 @@ def test_phrase_matcher_validation(en_vocab):
|
||||||
matcher.add("TEST1", [doc1])
|
matcher.add("TEST1", [doc1])
|
||||||
with pytest.warns(UserWarning):
|
with pytest.warns(UserWarning):
|
||||||
matcher.add("TEST2", [doc2])
|
matcher.add("TEST2", [doc2])
|
||||||
with pytest.warns(None) as record:
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("error")
|
||||||
matcher.add("TEST3", [doc3])
|
matcher.add("TEST3", [doc3])
|
||||||
assert not record.list
|
|
||||||
matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
|
matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
|
||||||
with pytest.warns(None) as record:
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("error")
|
||||||
matcher.add("TEST4", [doc2])
|
matcher.add("TEST4", [doc2])
|
||||||
assert not record.list
|
|
||||||
|
|
||||||
|
|
||||||
def test_attr_validation(en_vocab):
|
def test_attr_validation(en_vocab):
|
||||||
|
|
|
@ -4,8 +4,8 @@ from pathlib import Path
|
||||||
|
|
||||||
def test_build_dependencies():
|
def test_build_dependencies():
|
||||||
# Check that library requirements are pinned exactly the same across different setup files.
|
# Check that library requirements are pinned exactly the same across different setup files.
|
||||||
# TODO: correct checks for numpy rather than ignoring
|
|
||||||
libs_ignore_requirements = [
|
libs_ignore_requirements = [
|
||||||
|
"cython",
|
||||||
"pytest",
|
"pytest",
|
||||||
"pytest-timeout",
|
"pytest-timeout",
|
||||||
"mock",
|
"mock",
|
||||||
|
@ -21,7 +21,7 @@ def test_build_dependencies():
|
||||||
# ignore language-specific packages that shouldn't be installed by all
|
# ignore language-specific packages that shouldn't be installed by all
|
||||||
libs_ignore_setup = [
|
libs_ignore_setup = [
|
||||||
"fugashi",
|
"fugashi",
|
||||||
"natto-py",
|
"mecab-ko",
|
||||||
"pythainlp",
|
"pythainlp",
|
||||||
"sudachipy",
|
"sudachipy",
|
||||||
"sudachidict_core",
|
"sudachidict_core",
|
||||||
|
|
|
@ -1049,6 +1049,10 @@ def test_no_gold_ents(patterns):
|
||||||
for eg in train_examples:
|
for eg in train_examples:
|
||||||
eg.predicted = ruler(eg.predicted)
|
eg.predicted = ruler(eg.predicted)
|
||||||
|
|
||||||
|
# Entity ruler is no longer needed (initialization below wipes out the
|
||||||
|
# patterns and causes warnings)
|
||||||
|
nlp.remove_pipe("entity_ruler")
|
||||||
|
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
# create artificial KB
|
# create artificial KB
|
||||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
||||||
|
|
|
@ -659,3 +659,14 @@ def test_multiprocessing_gpu_warning(nlp2, texts):
|
||||||
# Trigger multi-processing.
|
# Trigger multi-processing.
|
||||||
for _ in docs:
|
for _ in docs:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_dot_in_factory_names(nlp):
|
||||||
|
Language.component("my_evil_component", func=evil_component)
|
||||||
|
nlp.add_pipe("my_evil_component")
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="not permitted"):
|
||||||
|
Language.component("my.evil.component.v1", func=evil_component)
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="not permitted"):
|
||||||
|
Language.factory("my.evil.component.v1", func=evil_component)
|
||||||
|
|
|
@ -33,6 +33,9 @@ URLS_SHOULD_MATCH = [
|
||||||
"http://userid:password@example.com/",
|
"http://userid:password@example.com/",
|
||||||
"http://142.42.1.1/",
|
"http://142.42.1.1/",
|
||||||
"http://142.42.1.1:8080/",
|
"http://142.42.1.1:8080/",
|
||||||
|
"http://10.140.12.13/foo",
|
||||||
|
"http://10.140.12.13/foo/bar?arg1=baz&arg2=taz",
|
||||||
|
"http://10.1.1.1",
|
||||||
"http://foo.com/blah_(wikipedia)#cite-1",
|
"http://foo.com/blah_(wikipedia)#cite-1",
|
||||||
"http://foo.com/blah_(wikipedia)_blah#cite-1",
|
"http://foo.com/blah_(wikipedia)_blah#cite-1",
|
||||||
"http://foo.com/unicode_(✪)_in_parens",
|
"http://foo.com/unicode_(✪)_in_parens",
|
||||||
|
@ -94,6 +97,7 @@ URLS_SHOULD_NOT_MATCH = [
|
||||||
"http://foo.bar/foo(bar)baz quux",
|
"http://foo.bar/foo(bar)baz quux",
|
||||||
"http://-error-.invalid/",
|
"http://-error-.invalid/",
|
||||||
"http://a.b-.co",
|
"http://a.b-.co",
|
||||||
|
# Loopback and broadcast addresses should be excluded
|
||||||
"http://0.0.0.0",
|
"http://0.0.0.0",
|
||||||
"http://10.1.1.0",
|
"http://10.1.1.0",
|
||||||
"http://10.1.1.255",
|
"http://10.1.1.255",
|
||||||
|
@ -102,7 +106,6 @@ URLS_SHOULD_NOT_MATCH = [
|
||||||
"http://3628126748",
|
"http://3628126748",
|
||||||
"http://.www.foo.bar/",
|
"http://.www.foo.bar/",
|
||||||
"http://.www.foo.bar./",
|
"http://.www.foo.bar./",
|
||||||
"http://10.1.1.1",
|
|
||||||
"NASDAQ:GOOG",
|
"NASDAQ:GOOG",
|
||||||
"http://-a.b.co",
|
"http://-a.b.co",
|
||||||
pytest.param("foo.com", marks=pytest.mark.xfail()),
|
pytest.param("foo.com", marks=pytest.mark.xfail()),
|
||||||
|
|
|
@ -431,3 +431,41 @@ def test_Example_aligned_whitespace(en_vocab):
|
||||||
|
|
||||||
example = Example(predicted, reference)
|
example = Example(predicted, reference)
|
||||||
assert example.get_aligned("TAG", as_string=True) == tags
|
assert example.get_aligned("TAG", as_string=True) == tags
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue("11260")
|
||||||
|
def test_issue11260():
|
||||||
|
annots = {
|
||||||
|
"words": ["I", "like", "New", "York", "."],
|
||||||
|
"spans": {
|
||||||
|
"cities": [(7, 15, "LOC", "")],
|
||||||
|
"people": [(0, 1, "PERSON", "")],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
vocab = Vocab()
|
||||||
|
predicted = Doc(vocab, words=annots["words"])
|
||||||
|
example = Example.from_dict(predicted, annots)
|
||||||
|
assert len(example.reference.spans["cities"]) == 1
|
||||||
|
assert len(example.reference.spans["people"]) == 1
|
||||||
|
|
||||||
|
output_dict = example.to_dict()
|
||||||
|
assert "spans" in output_dict["doc_annotation"]
|
||||||
|
assert output_dict["doc_annotation"]["spans"]["cities"] == annots["spans"]["cities"]
|
||||||
|
assert output_dict["doc_annotation"]["spans"]["people"] == annots["spans"]["people"]
|
||||||
|
|
||||||
|
output_example = Example.from_dict(predicted, output_dict)
|
||||||
|
|
||||||
|
assert len(output_example.reference.spans["cities"]) == len(
|
||||||
|
example.reference.spans["cities"]
|
||||||
|
)
|
||||||
|
assert len(output_example.reference.spans["people"]) == len(
|
||||||
|
example.reference.spans["people"]
|
||||||
|
)
|
||||||
|
for span in example.reference.spans["cities"]:
|
||||||
|
assert span.label_ == "LOC"
|
||||||
|
assert span.text == "New York"
|
||||||
|
assert span.start_char == 7
|
||||||
|
for span in example.reference.spans["people"]:
|
||||||
|
assert span.label_ == "PERSON"
|
||||||
|
assert span.text == "I"
|
||||||
|
assert span.start_char == 0
|
||||||
|
|
|
@ -23,11 +23,7 @@ cdef class Tokenizer:
|
||||||
cdef object _infix_finditer
|
cdef object _infix_finditer
|
||||||
cdef object _rules
|
cdef object _rules
|
||||||
cdef PhraseMatcher _special_matcher
|
cdef PhraseMatcher _special_matcher
|
||||||
# TODO convert to bool in v4
|
cdef bint _faster_heuristics
|
||||||
cdef int _faster_heuristics
|
|
||||||
# TODO next one is unused and should be removed in v4
|
|
||||||
# https://github.com/explosion/spaCy/pull/9150
|
|
||||||
cdef int _unused_int2
|
|
||||||
|
|
||||||
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
|
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
|
||||||
cdef int _apply_special_cases(self, Doc doc) except -1
|
cdef int _apply_special_cases(self, Doc doc) except -1
|
||||||
|
|
|
@ -8,7 +8,6 @@ from preshed.maps cimport PreshMap
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import warnings
|
|
||||||
|
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
from .strings cimport hash_string
|
from .strings cimport hash_string
|
||||||
|
@ -16,9 +15,9 @@ from .lexeme cimport EMPTY_LEXEME
|
||||||
|
|
||||||
from .attrs import intify_attrs
|
from .attrs import intify_attrs
|
||||||
from .symbols import ORTH, NORM
|
from .symbols import ORTH, NORM
|
||||||
from .errors import Errors, Warnings
|
from .errors import Errors
|
||||||
from . import util
|
from . import util
|
||||||
from .util import registry, get_words_and_spaces
|
from .util import get_words_and_spaces
|
||||||
from .attrs import intify_attrs
|
from .attrs import intify_attrs
|
||||||
from .symbols import ORTH
|
from .symbols import ORTH
|
||||||
from .scorer import Scorer
|
from .scorer import Scorer
|
||||||
|
@ -128,10 +127,10 @@ cdef class Tokenizer:
|
||||||
|
|
||||||
property faster_heuristics:
|
property faster_heuristics:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return bool(self._faster_heuristics)
|
return self._faster_heuristics
|
||||||
|
|
||||||
def __set__(self, faster_heuristics):
|
def __set__(self, faster_heuristics):
|
||||||
self._faster_heuristics = bool(faster_heuristics)
|
self._faster_heuristics = faster_heuristics
|
||||||
self._reload_special_cases()
|
self._reload_special_cases()
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
|
@ -582,7 +581,7 @@ cdef class Tokenizer:
|
||||||
substrings (iterable): A sequence of dicts, where each dict describes
|
substrings (iterable): A sequence of dicts, where each dict describes
|
||||||
a token and its attributes.
|
a token and its attributes.
|
||||||
"""
|
"""
|
||||||
attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings]
|
attrs = [intify_attrs(spec) for spec in substrings]
|
||||||
orth = "".join([spec[ORTH] for spec in attrs])
|
orth = "".join([spec[ORTH] for spec in attrs])
|
||||||
if chunk != orth:
|
if chunk != orth:
|
||||||
raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
|
raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
|
||||||
|
@ -650,7 +649,7 @@ cdef class Tokenizer:
|
||||||
url_match = re.compile("a^").match
|
url_match = re.compile("a^").match
|
||||||
special_cases = {}
|
special_cases = {}
|
||||||
for orth, special_tokens in self.rules.items():
|
for orth, special_tokens in self.rules.items():
|
||||||
special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
|
special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings) for special_token in special_tokens]
|
||||||
tokens = []
|
tokens = []
|
||||||
for substring in text.split():
|
for substring in text.split():
|
||||||
suffixes = []
|
suffixes = []
|
||||||
|
|
|
@ -809,27 +809,33 @@ cdef class Doc:
|
||||||
self.c[i].ent_iob = 1
|
self.c[i].ent_iob = 1
|
||||||
self.c[i].ent_type = span.label
|
self.c[i].ent_type = span.label
|
||||||
self.c[i].ent_kb_id = span.kb_id
|
self.c[i].ent_kb_id = span.kb_id
|
||||||
# for backwards compatibility in v3, only set ent_id from
|
self.c[i].ent_id = span.id
|
||||||
# span.id if it's set, otherwise don't override
|
|
||||||
self.c[i].ent_id = span.id if span.id else self.c[i].ent_id
|
|
||||||
for span in blocked:
|
for span in blocked:
|
||||||
for i in range(span.start, span.end):
|
for i in range(span.start, span.end):
|
||||||
self.c[i].ent_iob = 3
|
self.c[i].ent_iob = 3
|
||||||
self.c[i].ent_type = 0
|
self.c[i].ent_type = 0
|
||||||
|
self.c[i].ent_kb_id = 0
|
||||||
|
self.c[i].ent_id = 0
|
||||||
for span in missing:
|
for span in missing:
|
||||||
for i in range(span.start, span.end):
|
for i in range(span.start, span.end):
|
||||||
self.c[i].ent_iob = 0
|
self.c[i].ent_iob = 0
|
||||||
self.c[i].ent_type = 0
|
self.c[i].ent_type = 0
|
||||||
|
self.c[i].ent_kb_id = 0
|
||||||
|
self.c[i].ent_id = 0
|
||||||
for span in outside:
|
for span in outside:
|
||||||
for i in range(span.start, span.end):
|
for i in range(span.start, span.end):
|
||||||
self.c[i].ent_iob = 2
|
self.c[i].ent_iob = 2
|
||||||
self.c[i].ent_type = 0
|
self.c[i].ent_type = 0
|
||||||
|
self.c[i].ent_kb_id = 0
|
||||||
|
self.c[i].ent_id = 0
|
||||||
|
|
||||||
# Set tokens outside of all provided spans
|
# Set tokens outside of all provided spans
|
||||||
if default != SetEntsDefault.unmodified:
|
if default != SetEntsDefault.unmodified:
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
if i not in seen_tokens:
|
if i not in seen_tokens:
|
||||||
self.c[i].ent_type = 0
|
self.c[i].ent_type = 0
|
||||||
|
self.c[i].ent_kb_id = 0
|
||||||
|
self.c[i].ent_id = 0
|
||||||
if default == SetEntsDefault.outside:
|
if default == SetEntsDefault.outside:
|
||||||
self.c[i].ent_iob = 2
|
self.c[i].ent_iob = 2
|
||||||
elif default == SetEntsDefault.missing:
|
elif default == SetEntsDefault.missing:
|
||||||
|
@ -1603,13 +1609,30 @@ cdef class Doc:
|
||||||
ents.append(char_span)
|
ents.append(char_span)
|
||||||
self.ents = ents
|
self.ents = ents
|
||||||
|
|
||||||
# Add custom attributes. Note that only Doc extensions are currently considered, Token and Span extensions are
|
# Add custom attributes for the whole Doc object.
|
||||||
# not yet supported.
|
|
||||||
for attr in doc_json.get("_", {}):
|
for attr in doc_json.get("_", {}):
|
||||||
if not Doc.has_extension(attr):
|
if not Doc.has_extension(attr):
|
||||||
Doc.set_extension(attr)
|
Doc.set_extension(attr)
|
||||||
self._.set(attr, doc_json["_"][attr])
|
self._.set(attr, doc_json["_"][attr])
|
||||||
|
|
||||||
|
if doc_json.get("underscore_token", {}):
|
||||||
|
for token_attr in doc_json["underscore_token"]:
|
||||||
|
token_start = doc_json["underscore_token"][token_attr]["token_start"]
|
||||||
|
value = doc_json["underscore_token"][token_attr]["value"]
|
||||||
|
|
||||||
|
if not Token.has_extension(token_attr):
|
||||||
|
Token.set_extension(token_attr)
|
||||||
|
self[token_start]._.set(token_attr, value)
|
||||||
|
|
||||||
|
if doc_json.get("underscore_span", {}):
|
||||||
|
for span_attr in doc_json["underscore_span"]:
|
||||||
|
token_start = doc_json["underscore_span"][span_attr]["token_start"]
|
||||||
|
token_end = doc_json["underscore_span"][span_attr]["token_end"]
|
||||||
|
value = doc_json["underscore_span"][span_attr]["value"]
|
||||||
|
|
||||||
|
if not Span.has_extension(span_attr):
|
||||||
|
Span.set_extension(span_attr)
|
||||||
|
self[token_start:token_end]._.set(span_attr, value)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_json(self, underscore=None):
|
def to_json(self, underscore=None):
|
||||||
|
@ -1651,20 +1674,40 @@ cdef class Doc:
|
||||||
for span_group in self.spans:
|
for span_group in self.spans:
|
||||||
data["spans"][span_group] = []
|
data["spans"][span_group] = []
|
||||||
for span in self.spans[span_group]:
|
for span in self.spans[span_group]:
|
||||||
span_data = {
|
span_data = {"start": span.start_char, "end": span.end_char, "label": span.label_, "kb_id": span.kb_id_}
|
||||||
"start": span.start_char, "end": span.end_char, "label": span.label_, "kb_id": span.kb_id_
|
|
||||||
}
|
|
||||||
data["spans"][span_group].append(span_data)
|
data["spans"][span_group].append(span_data)
|
||||||
|
|
||||||
if underscore:
|
if underscore:
|
||||||
data["_"] = {}
|
user_keys = set()
|
||||||
|
if self.user_data:
|
||||||
|
data["_"] = {}
|
||||||
|
data["underscore_token"] = {}
|
||||||
|
data["underscore_span"] = {}
|
||||||
|
for data_key in self.user_data:
|
||||||
|
if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
|
||||||
|
attr = data_key[1]
|
||||||
|
start = data_key[2]
|
||||||
|
end = data_key[3]
|
||||||
|
if attr in underscore:
|
||||||
|
user_keys.add(attr)
|
||||||
|
value = self.user_data[data_key]
|
||||||
|
if not srsly.is_json_serializable(value):
|
||||||
|
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
|
||||||
|
# Check if doc attribute
|
||||||
|
if start is None:
|
||||||
|
data["_"][attr] = value
|
||||||
|
# Check if token attribute
|
||||||
|
elif end is None:
|
||||||
|
if attr not in data["underscore_token"]:
|
||||||
|
data["underscore_token"][attr] = {"token_start": start, "value": value}
|
||||||
|
# Else span attribute
|
||||||
|
else:
|
||||||
|
if attr not in data["underscore_span"]:
|
||||||
|
data["underscore_span"][attr] = {"token_start": start, "token_end": end, "value": value}
|
||||||
|
|
||||||
for attr in underscore:
|
for attr in underscore:
|
||||||
if not self.has_extension(attr):
|
if attr not in user_keys:
|
||||||
raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
|
raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
|
||||||
value = self._.get(attr)
|
|
||||||
if not srsly.is_json_serializable(value):
|
|
||||||
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
|
|
||||||
data["_"][attr] = value
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def to_utf8_array(self, int nr_char=-1):
|
def to_utf8_array(self, int nr_char=-1):
|
||||||
|
|
|
@ -115,17 +115,23 @@ class Span:
|
||||||
end: int
|
end: int
|
||||||
start_char: int
|
start_char: int
|
||||||
end_char: int
|
end_char: int
|
||||||
label: int
|
@property
|
||||||
kb_id: int
|
def label(self) -> int: ...
|
||||||
ent_id: int
|
@property
|
||||||
ent_id_: str
|
def kb_id(self) -> int: ...
|
||||||
@property
|
@property
|
||||||
def id(self) -> int: ...
|
def id(self) -> int: ...
|
||||||
@property
|
@property
|
||||||
def id_(self) -> str: ...
|
def ent_id(self) -> int: ...
|
||||||
@property
|
@property
|
||||||
def orth_(self) -> str: ...
|
def orth_(self) -> str: ...
|
||||||
@property
|
@property
|
||||||
def lemma_(self) -> str: ...
|
def lemma_(self) -> str: ...
|
||||||
label_: str
|
@property
|
||||||
kb_id_: str
|
def label_(self) -> str: ...
|
||||||
|
@property
|
||||||
|
def kb_id_(self) -> str: ...
|
||||||
|
@property
|
||||||
|
def id_(self) -> str: ...
|
||||||
|
@property
|
||||||
|
def ent_id_(self) -> str: ...
|
||||||
|
|
|
@ -802,28 +802,18 @@ cdef class Span:
|
||||||
|
|
||||||
property id:
|
property id:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
cdef SpanC* span_c = self.span_c()
|
return self.span_c().id
|
||||||
return span_c.id
|
|
||||||
|
|
||||||
def __set__(self, attr_t id):
|
def __set__(self, attr_t id):
|
||||||
cdef SpanC* span_c = self.span_c()
|
self.span_c().id = id
|
||||||
span_c.id = id
|
|
||||||
|
|
||||||
property ent_id:
|
property ent_id:
|
||||||
"""RETURNS (uint64): The entity ID."""
|
"""Alias for the span's ID."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.root.ent_id
|
return self.id
|
||||||
|
|
||||||
def __set__(self, hash_t key):
|
def __set__(self, attr_t ent_id):
|
||||||
raise NotImplementedError(Errors.E200.format(attr="ent_id"))
|
self.id = ent_id
|
||||||
|
|
||||||
property ent_id_:
|
|
||||||
"""RETURNS (str): The (string) entity ID."""
|
|
||||||
def __get__(self):
|
|
||||||
return self.root.ent_id_
|
|
||||||
|
|
||||||
def __set__(self, str key):
|
|
||||||
raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def orth_(self):
|
def orth_(self):
|
||||||
|
@ -839,7 +829,7 @@ cdef class Span:
|
||||||
return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
|
return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
|
||||||
|
|
||||||
property label_:
|
property label_:
|
||||||
"""RETURNS (str): The span's label."""
|
"""The span's label."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.doc.vocab.strings[self.label]
|
return self.doc.vocab.strings[self.label]
|
||||||
|
|
||||||
|
@ -847,7 +837,7 @@ cdef class Span:
|
||||||
self.label = self.doc.vocab.strings.add(label_)
|
self.label = self.doc.vocab.strings.add(label_)
|
||||||
|
|
||||||
property kb_id_:
|
property kb_id_:
|
||||||
"""RETURNS (str): The span's KB ID."""
|
"""The span's KB ID."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.doc.vocab.strings[self.kb_id]
|
return self.doc.vocab.strings[self.kb_id]
|
||||||
|
|
||||||
|
@ -855,13 +845,22 @@ cdef class Span:
|
||||||
self.kb_id = self.doc.vocab.strings.add(kb_id_)
|
self.kb_id = self.doc.vocab.strings.add(kb_id_)
|
||||||
|
|
||||||
property id_:
|
property id_:
|
||||||
"""RETURNS (str): The span's ID."""
|
"""The span's ID."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.doc.vocab.strings[self.id]
|
return self.doc.vocab.strings[self.id]
|
||||||
|
|
||||||
def __set__(self, str id_):
|
def __set__(self, str id_):
|
||||||
self.id = self.doc.vocab.strings.add(id_)
|
self.id = self.doc.vocab.strings.add(id_)
|
||||||
|
|
||||||
|
property ent_id_:
|
||||||
|
"""Alias for the span's ID."""
|
||||||
|
def __get__(self):
|
||||||
|
return self.id_
|
||||||
|
|
||||||
|
def __set__(self, str ent_id_):
|
||||||
|
self.id_ = ent_id_
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
|
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
|
||||||
# Don't allow spaces to be the root, if there are
|
# Don't allow spaces to be the root, if there are
|
||||||
|
|
|
@ -361,6 +361,7 @@ cdef class Example:
|
||||||
"doc_annotation": {
|
"doc_annotation": {
|
||||||
"cats": dict(self.reference.cats),
|
"cats": dict(self.reference.cats),
|
||||||
"entities": doc_to_biluo_tags(self.reference),
|
"entities": doc_to_biluo_tags(self.reference),
|
||||||
|
"spans": self._spans_to_dict(),
|
||||||
"links": self._links_to_dict()
|
"links": self._links_to_dict()
|
||||||
},
|
},
|
||||||
"token_annotation": {
|
"token_annotation": {
|
||||||
|
@ -376,6 +377,18 @@ cdef class Example:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _spans_to_dict(self):
|
||||||
|
span_dict = {}
|
||||||
|
for key in self.reference.spans:
|
||||||
|
span_tuples = []
|
||||||
|
for span in self.reference.spans[key]:
|
||||||
|
span_tuple = (span.start_char, span.end_char, span.label_, span.kb_id_)
|
||||||
|
span_tuples.append(span_tuple)
|
||||||
|
span_dict[key] = span_tuples
|
||||||
|
|
||||||
|
return span_dict
|
||||||
|
|
||||||
|
|
||||||
def _links_to_dict(self):
|
def _links_to_dict(self):
|
||||||
links = {}
|
links = {}
|
||||||
for ent in self.reference.ents:
|
for ent in self.reference.ents:
|
||||||
|
|
|
@ -337,3 +337,5 @@ def ensure_shape(vectors_loc):
|
||||||
# store all the results in a list in memory
|
# store all the results in a list in memory
|
||||||
lines2 = open_file(vectors_loc)
|
lines2 = open_file(vectors_loc)
|
||||||
yield from lines2
|
yield from lines2
|
||||||
|
lines2.close()
|
||||||
|
lines.close()
|
||||||
|
|
|
@ -795,6 +795,15 @@ def get_model_lower_version(constraint: str) -> Optional[str]:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def is_prerelease_version(version: str) -> bool:
|
||||||
|
"""Check whether a version is a prerelease version.
|
||||||
|
|
||||||
|
version (str): The version, e.g. "3.0.0.dev1".
|
||||||
|
RETURNS (bool): Whether the version is a prerelease version.
|
||||||
|
"""
|
||||||
|
return Version(version).is_prerelease
|
||||||
|
|
||||||
|
|
||||||
def get_base_version(version: str) -> str:
|
def get_base_version(version: str) -> str:
|
||||||
"""Generate the base version without any prerelease identifiers.
|
"""Generate the base version without any prerelease identifiers.
|
||||||
|
|
||||||
|
|
|
@ -32,7 +32,6 @@ cdef class Vocab:
|
||||||
cdef public object writing_system
|
cdef public object writing_system
|
||||||
cdef public object get_noun_chunks
|
cdef public object get_noun_chunks
|
||||||
cdef readonly int length
|
cdef readonly int length
|
||||||
cdef public object _unused_object # TODO remove in v4, see #9150
|
|
||||||
cdef public object lex_attr_getters
|
cdef public object lex_attr_getters
|
||||||
cdef public object cfg
|
cdef public object cfg
|
||||||
|
|
||||||
|
|
|
@ -72,7 +72,6 @@ def unpickle_vocab(
|
||||||
sstore: StringStore,
|
sstore: StringStore,
|
||||||
vectors: Any,
|
vectors: Any,
|
||||||
morphology: Any,
|
morphology: Any,
|
||||||
_unused_object: Any,
|
|
||||||
lex_attr_getters: Any,
|
lex_attr_getters: Any,
|
||||||
lookups: Any,
|
lookups: Any,
|
||||||
get_noun_chunks: Any,
|
get_noun_chunks: Any,
|
||||||
|
|
|
@ -268,8 +268,7 @@ cdef class Vocab:
|
||||||
cdef int i
|
cdef int i
|
||||||
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
||||||
for i, props in enumerate(substrings):
|
for i, props in enumerate(substrings):
|
||||||
props = intify_attrs(props, strings_map=self.strings,
|
props = intify_attrs(props, strings_map=self.strings)
|
||||||
_do_deprecated=True)
|
|
||||||
token = &tokens[i]
|
token = &tokens[i]
|
||||||
# Set the special tokens up to have arbitrary attributes
|
# Set the special tokens up to have arbitrary attributes
|
||||||
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
|
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
|
||||||
|
@ -559,21 +558,18 @@ def pickle_vocab(vocab):
|
||||||
sstore = vocab.strings
|
sstore = vocab.strings
|
||||||
vectors = vocab.vectors
|
vectors = vocab.vectors
|
||||||
morph = vocab.morphology
|
morph = vocab.morphology
|
||||||
_unused_object = vocab._unused_object
|
|
||||||
lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
|
lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
|
||||||
lookups = vocab.lookups
|
lookups = vocab.lookups
|
||||||
get_noun_chunks = vocab.get_noun_chunks
|
get_noun_chunks = vocab.get_noun_chunks
|
||||||
return (unpickle_vocab,
|
return (unpickle_vocab,
|
||||||
(sstore, vectors, morph, _unused_object, lex_attr_getters, lookups, get_noun_chunks))
|
(sstore, vectors, morph, lex_attr_getters, lookups, get_noun_chunks))
|
||||||
|
|
||||||
|
|
||||||
def unpickle_vocab(sstore, vectors, morphology, _unused_object,
|
def unpickle_vocab(sstore, vectors, morphology, lex_attr_getters, lookups, get_noun_chunks):
|
||||||
lex_attr_getters, lookups, get_noun_chunks):
|
|
||||||
cdef Vocab vocab = Vocab()
|
cdef Vocab vocab = Vocab()
|
||||||
vocab.vectors = vectors
|
vocab.vectors = vectors
|
||||||
vocab.strings = sstore
|
vocab.strings = sstore
|
||||||
vocab.morphology = morphology
|
vocab.morphology = morphology
|
||||||
vocab._unused_object = _unused_object
|
|
||||||
vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
|
vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
|
||||||
vocab.lookups = lookups
|
vocab.lookups = lookups
|
||||||
vocab.get_noun_chunks = get_noun_chunks
|
vocab.get_noun_chunks = get_noun_chunks
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
---
|
---
|
||||||
title: AttributeRuler
|
title: AttributeRuler
|
||||||
tag: class
|
tag: class
|
||||||
source: spacy/pipeline/attributeruler.py
|
source: spacy/pipeline/attribute_ruler.py
|
||||||
new: 3
|
new: 3
|
||||||
teaser: 'Pipeline component for rule-based token attribute assignment'
|
teaser: 'Pipeline component for rule-based token attribute assignment'
|
||||||
api_string_name: attribute_ruler
|
api_string_name: attribute_ruler
|
||||||
|
@ -34,7 +34,7 @@ how the component should be configured. You can override its settings via the
|
||||||
| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
|
| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/pipeline/attributeruler.py
|
%%GITHUB_SPACY/spacy/pipeline/attribute_ruler.py
|
||||||
```
|
```
|
||||||
|
|
||||||
## AttributeRuler.\_\_init\_\_ {#init tag="method"}
|
## AttributeRuler.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
|
@ -395,12 +395,13 @@ file to keep track of your settings and hyperparameters and your own
|
||||||
> "pos": List[str],
|
> "pos": List[str],
|
||||||
> "morphs": List[str],
|
> "morphs": List[str],
|
||||||
> "sent_starts": List[Optional[bool]],
|
> "sent_starts": List[Optional[bool]],
|
||||||
> "deps": List[string],
|
> "deps": List[str],
|
||||||
> "heads": List[int],
|
> "heads": List[int],
|
||||||
> "entities": List[str],
|
> "entities": List[str],
|
||||||
> "entities": List[(int, int, str)],
|
> "entities": List[(int, int, str)],
|
||||||
> "cats": Dict[str, float],
|
> "cats": Dict[str, float],
|
||||||
> "links": Dict[(int, int), dict],
|
> "links": Dict[(int, int), dict],
|
||||||
|
> "spans": Dict[str, List[Tuple]],
|
||||||
> }
|
> }
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -417,9 +418,10 @@ file to keep track of your settings and hyperparameters and your own
|
||||||
| `deps` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. ~~List[str]~~ |
|
| `deps` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. ~~List[str]~~ |
|
||||||
| `heads` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. ~~List[int]~~ |
|
| `heads` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. ~~List[int]~~ |
|
||||||
| `entities` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. ~~List[str]~~ |
|
| `entities` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. ~~List[str]~~ |
|
||||||
| `entities` | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ |
|
| `entities` | **Option 2:** List of `(start_char, end_char, label)` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ |
|
||||||
| `cats` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. ~~Dict[str, float]~~ |
|
| `cats` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. ~~Dict[str, float]~~ |
|
||||||
| `links` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. ~~Dict[Tuple[int, int], Dict]~~ |
|
| `links` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. ~~Dict[Tuple[int, int], Dict]~~ |
|
||||||
|
| `spans` | Dictionary of `spans_key`/`List[Tuple]` pairs defining the spans for each spans key as `(start_char, end_char, label, kb_id)` tuples. ~~Dict[str, List[Tuple[int, int, str, str]]]~~ |
|
||||||
|
|
||||||
<Infobox title="Notes and caveats">
|
<Infobox title="Notes and caveats">
|
||||||
|
|
||||||
|
|
|
@ -158,10 +158,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
|
||||||
## DependencyParser.initialize {#initialize tag="method" new="3"}
|
## DependencyParser.initialize {#initialize tag="method" new="3"}
|
||||||
|
|
||||||
Initialize the component for training. `get_examples` should be a function that
|
Initialize the component for training. `get_examples` should be a function that
|
||||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
returns an iterable of [`Example`](/api/example) objects. **At least one example
|
||||||
used to **initialize the model** of the component and can either be the full
|
should be supplied.** The data examples are used to **initialize the model** of
|
||||||
training data or a representative sample. Initialization includes validating the
|
the component and can either be the full training data or a representative
|
||||||
network,
|
sample. Initialization includes validating the network,
|
||||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
setting up the label scheme based on the data. This method is typically called
|
setting up the label scheme based on the data. This method is typically called
|
||||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||||
|
@ -179,7 +179,7 @@ This method was previously called `begin_training`.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> parser = nlp.add_pipe("parser")
|
> parser = nlp.add_pipe("parser")
|
||||||
> parser.initialize(lambda: [], nlp=nlp)
|
> parser.initialize(lambda: examples, nlp=nlp)
|
||||||
> ```
|
> ```
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
|
@ -193,7 +193,7 @@ This method was previously called `begin_training`.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. The returned iterable must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Dict[str, Dict[str, int]]]~~ |
|
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Dict[str, Dict[str, int]]]~~ |
|
||||||
|
|
|
@ -142,10 +142,10 @@ and [`pipe`](/api/edittreelemmatizer#pipe) delegate to the
|
||||||
## EditTreeLemmatizer.initialize {#initialize tag="method" new="3"}
|
## EditTreeLemmatizer.initialize {#initialize tag="method" new="3"}
|
||||||
|
|
||||||
Initialize the component for training. `get_examples` should be a function that
|
Initialize the component for training. `get_examples` should be a function that
|
||||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
returns an iterable of [`Example`](/api/example) objects. **At least one example
|
||||||
used to **initialize the model** of the component and can either be the full
|
should be supplied.** The data examples are used to **initialize the model** of
|
||||||
training data or a representative sample. Initialization includes validating the
|
the component and can either be the full training data or a representative
|
||||||
network,
|
sample. Initialization includes validating the network,
|
||||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
setting up the label scheme based on the data. This method is typically called
|
setting up the label scheme based on the data. This method is typically called
|
||||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||||
|
@ -157,7 +157,7 @@ config.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
|
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
|
||||||
> lemmatizer.initialize(lambda: [], nlp=nlp)
|
> lemmatizer.initialize(lambda: examples, nlp=nlp)
|
||||||
> ```
|
> ```
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
|
@ -171,7 +171,7 @@ config.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. The returned iterable must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
|
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
|
||||||
|
|
|
@ -186,10 +186,10 @@ with the current vocab.
|
||||||
## EntityLinker.initialize {#initialize tag="method" new="3"}
|
## EntityLinker.initialize {#initialize tag="method" new="3"}
|
||||||
|
|
||||||
Initialize the component for training. `get_examples` should be a function that
|
Initialize the component for training. `get_examples` should be a function that
|
||||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
returns an iterable of [`Example`](/api/example) objects. **At least one example
|
||||||
used to **initialize the model** of the component and can either be the full
|
should be supplied.** The data examples are used to **initialize the model** of
|
||||||
training data or a representative sample. Initialization includes validating the
|
the component and can either be the full training data or a representative
|
||||||
network,
|
sample. Initialization includes validating the network,
|
||||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
setting up the label scheme based on the data. This method is typically called
|
setting up the label scheme based on the data. This method is typically called
|
||||||
by [`Language.initialize`](/api/language#initialize).
|
by [`Language.initialize`](/api/language#initialize).
|
||||||
|
@ -209,15 +209,15 @@ This method was previously called `begin_training`.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> entity_linker = nlp.add_pipe("entity_linker")
|
> entity_linker = nlp.add_pipe("entity_linker")
|
||||||
> entity_linker.initialize(lambda: [], nlp=nlp, kb_loader=my_kb)
|
> entity_linker.initialize(lambda: examples, nlp=nlp, kb_loader=my_kb)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. The returned iterable must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |
|
| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |
|
||||||
|
|
||||||
## EntityLinker.predict {#predict tag="method"}
|
## EntityLinker.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -154,10 +154,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
|
||||||
## EntityRecognizer.initialize {#initialize tag="method" new="3"}
|
## EntityRecognizer.initialize {#initialize tag="method" new="3"}
|
||||||
|
|
||||||
Initialize the component for training. `get_examples` should be a function that
|
Initialize the component for training. `get_examples` should be a function that
|
||||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
returns an iterable of [`Example`](/api/example) objects. **At least one example
|
||||||
used to **initialize the model** of the component and can either be the full
|
should be supplied.** The data examples are used to **initialize the model** of
|
||||||
training data or a representative sample. Initialization includes validating the
|
the component and can either be the full training data or a representative
|
||||||
network,
|
sample. Initialization includes validating the network,
|
||||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
setting up the label scheme based on the data. This method is typically called
|
setting up the label scheme based on the data. This method is typically called
|
||||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||||
|
@ -175,7 +175,7 @@ This method was previously called `begin_training`.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ner = nlp.add_pipe("ner")
|
> ner = nlp.add_pipe("ner")
|
||||||
> ner.initialize(lambda: [], nlp=nlp)
|
> ner.initialize(lambda: examples, nlp=nlp)
|
||||||
> ```
|
> ```
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
|
@ -189,7 +189,7 @@ This method was previously called `begin_training`.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. The returned iterable must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Dict[str, Dict[str, int]]]~~ |
|
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Dict[str, Dict[str, int]]]~~ |
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
---
|
---
|
||||||
title: EntityRuler
|
title: EntityRuler
|
||||||
tag: class
|
tag: class
|
||||||
source: spacy/pipeline/entityruler.py
|
source: spacy/pipeline/entity_ruler.py
|
||||||
new: 2.1
|
new: 2.1
|
||||||
teaser: 'Pipeline component for rule-based named entity recognition'
|
teaser: 'Pipeline component for rule-based named entity recognition'
|
||||||
api_string_name: entity_ruler
|
api_string_name: entity_ruler
|
||||||
|
@ -64,7 +64,7 @@ how the component should be configured. You can override its settings via the
|
||||||
| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ |
|
| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/pipeline/entityruler.py
|
%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py
|
||||||
```
|
```
|
||||||
|
|
||||||
## EntityRuler.\_\_init\_\_ {#init tag="method"}
|
## EntityRuler.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
|
@ -70,7 +70,7 @@ lemmatizer is available. The lemmatizer modes `rule` and `pos_lookup` require
|
||||||
[`token.pos`](/api/token) from a previous pipeline component (see example
|
[`token.pos`](/api/token) from a previous pipeline component (see example
|
||||||
pipeline configurations in the
|
pipeline configurations in the
|
||||||
[pretrained pipeline design details](/models#design-cnn)) or rely on third-party
|
[pretrained pipeline design details](/models#design-cnn)) or rely on third-party
|
||||||
libraries (`pymorphy2`).
|
libraries (`pymorphy3`).
|
||||||
|
|
||||||
| Language | Default Mode |
|
| Language | Default Mode |
|
||||||
| -------- | ------------ |
|
| -------- | ------------ |
|
||||||
|
@ -86,9 +86,9 @@ libraries (`pymorphy2`).
|
||||||
| `nb` | `rule` |
|
| `nb` | `rule` |
|
||||||
| `nl` | `rule` |
|
| `nl` | `rule` |
|
||||||
| `pl` | `pos_lookup` |
|
| `pl` | `pos_lookup` |
|
||||||
| `ru` | `pymorphy2` |
|
| `ru` | `pymorphy3` |
|
||||||
| `sv` | `rule` |
|
| `sv` | `rule` |
|
||||||
| `uk` | `pymorphy2` |
|
| `uk` | `pymorphy3` |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
|
%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
|
||||||
|
|
|
@ -148,10 +148,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
|
||||||
## Morphologizer.initialize {#initialize tag="method"}
|
## Morphologizer.initialize {#initialize tag="method"}
|
||||||
|
|
||||||
Initialize the component for training. `get_examples` should be a function that
|
Initialize the component for training. `get_examples` should be a function that
|
||||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
returns an iterable of [`Example`](/api/example) objects. **At least one example
|
||||||
used to **initialize the model** of the component and can either be the full
|
should be supplied.** The data examples are used to **initialize the model** of
|
||||||
training data or a representative sample. Initialization includes validating the
|
the component and can either be the full training data or a representative
|
||||||
network,
|
sample. Initialization includes validating the network,
|
||||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
setting up the label scheme based on the data. This method is typically called
|
setting up the label scheme based on the data. This method is typically called
|
||||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||||
|
@ -163,7 +163,7 @@ config.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> morphologizer = nlp.add_pipe("morphologizer")
|
> morphologizer = nlp.add_pipe("morphologizer")
|
||||||
> morphologizer.initialize(lambda: [], nlp=nlp)
|
> morphologizer.initialize(lambda: examples, nlp=nlp)
|
||||||
> ```
|
> ```
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
|
@ -177,7 +177,7 @@ config.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. The returned iterable must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
|
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
|
||||||
|
|
|
@ -133,10 +133,10 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
|
||||||
## SentenceRecognizer.initialize {#initialize tag="method"}
|
## SentenceRecognizer.initialize {#initialize tag="method"}
|
||||||
|
|
||||||
Initialize the component for training. `get_examples` should be a function that
|
Initialize the component for training. `get_examples` should be a function that
|
||||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
returns an iterable of [`Example`](/api/example) objects. **At least one example
|
||||||
used to **initialize the model** of the component and can either be the full
|
should be supplied.** The data examples are used to **initialize the model** of
|
||||||
training data or a representative sample. Initialization includes validating the
|
the component and can either be the full training data or a representative
|
||||||
network,
|
sample. Initialization includes validating the network,
|
||||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
setting up the label scheme based on the data. This method is typically called
|
setting up the label scheme based on the data. This method is typically called
|
||||||
by [`Language.initialize`](/api/language#initialize).
|
by [`Language.initialize`](/api/language#initialize).
|
||||||
|
@ -145,14 +145,14 @@ by [`Language.initialize`](/api/language#initialize).
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> senter = nlp.add_pipe("senter")
|
> senter = nlp.add_pipe("senter")
|
||||||
> senter.initialize(lambda: [], nlp=nlp)
|
> senter.initialize(lambda: examples, nlp=nlp)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
|
|
||||||
## SentenceRecognizer.predict {#predict tag="method"}
|
## SentenceRecognizer.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -561,8 +561,8 @@ overlaps with will be returned.
|
||||||
| `lemma_` | The span's lemma. Equivalent to `"".join(token.lemma_ + token.whitespace_ for token in span).strip()`. ~~str~~ |
|
| `lemma_` | The span's lemma. Equivalent to `"".join(token.lemma_ + token.whitespace_ for token in span).strip()`. ~~str~~ |
|
||||||
| `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ |
|
| `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ |
|
||||||
| `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ |
|
| `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ |
|
||||||
| `ent_id` | The hash value of the named entity the root token is an instance of. ~~int~~ |
|
| `ent_id` | Alias for `id`: the hash value of the span's ID. ~~int~~ |
|
||||||
| `ent_id_` | The string ID of the named entity the root token is an instance of. ~~str~~ |
|
| `ent_id_` | Alias for `id_`: the span's ID. ~~str~~ |
|
||||||
| `id` | The hash value of the span's ID. ~~int~~ |
|
| `id` | The hash value of the span's ID. ~~int~~ |
|
||||||
| `id_` | The span's ID. ~~str~~ |
|
| `id_` | The span's ID. ~~str~~ |
|
||||||
| `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ |
|
| `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ |
|
||||||
|
|
|
@ -148,10 +148,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/spancategorizer#call) and
|
||||||
## SpanCategorizer.initialize {#initialize tag="method"}
|
## SpanCategorizer.initialize {#initialize tag="method"}
|
||||||
|
|
||||||
Initialize the component for training. `get_examples` should be a function that
|
Initialize the component for training. `get_examples` should be a function that
|
||||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
returns an iterable of [`Example`](/api/example) objects. **At least one example
|
||||||
used to **initialize the model** of the component and can either be the full
|
should be supplied.** The data examples are used to **initialize the model** of
|
||||||
training data or a representative sample. Initialization includes validating the
|
the component and can either be the full training data or a representative
|
||||||
network,
|
sample. Initialization includes validating the network,
|
||||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
setting up the label scheme based on the data. This method is typically called
|
setting up the label scheme based on the data. This method is typically called
|
||||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||||
|
@ -163,7 +163,7 @@ config.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> spancat = nlp.add_pipe("spancat")
|
> spancat = nlp.add_pipe("spancat")
|
||||||
> spancat.initialize(lambda: [], nlp=nlp)
|
> spancat.initialize(lambda: examples, nlp=nlp)
|
||||||
> ```
|
> ```
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
|
@ -177,7 +177,7 @@ config.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
|
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
|
||||||
|
|
|
@ -131,10 +131,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
|
||||||
## Tagger.initialize {#initialize tag="method" new="3"}
|
## Tagger.initialize {#initialize tag="method" new="3"}
|
||||||
|
|
||||||
Initialize the component for training. `get_examples` should be a function that
|
Initialize the component for training. `get_examples` should be a function that
|
||||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
returns an iterable of [`Example`](/api/example) objects. **At least one example
|
||||||
used to **initialize the model** of the component and can either be the full
|
should be supplied.** The data examples are used to **initialize the model** of
|
||||||
training data or a representative sample. Initialization includes validating the
|
the component and can either be the full training data or a representative
|
||||||
network,
|
sample. Initialization includes validating the network,
|
||||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
setting up the label scheme based on the data. This method is typically called
|
setting up the label scheme based on the data. This method is typically called
|
||||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||||
|
@ -152,7 +152,7 @@ This method was previously called `begin_training`.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tagger = nlp.add_pipe("tagger")
|
> tagger = nlp.add_pipe("tagger")
|
||||||
> tagger.initialize(lambda: [], nlp=nlp)
|
> tagger.initialize(lambda: examples, nlp=nlp)
|
||||||
> ```
|
> ```
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
|
@ -166,7 +166,7 @@ This method was previously called `begin_training`.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
|
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
|
||||||
|
|
|
@ -84,6 +84,7 @@ architectures and their arguments and hyperparameters.
|
||||||
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
|
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
|
||||||
| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
|
| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||||
|
| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/pipeline/textcat.py
|
%%GITHUB_SPACY/spacy/pipeline/textcat.py
|
||||||
|
@ -176,10 +177,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
|
||||||
## TextCategorizer.initialize {#initialize tag="method" new="3"}
|
## TextCategorizer.initialize {#initialize tag="method" new="3"}
|
||||||
|
|
||||||
Initialize the component for training. `get_examples` should be a function that
|
Initialize the component for training. `get_examples` should be a function that
|
||||||
returns an iterable of [`Example`](/api/example) objects. The data examples are
|
returns an iterable of [`Example`](/api/example) objects. **At least one example
|
||||||
used to **initialize the model** of the component and can either be the full
|
should be supplied.** The data examples are used to **initialize the model** of
|
||||||
training data or a representative sample. Initialization includes validating the
|
the component and can either be the full training data or a representative
|
||||||
network,
|
sample. Initialization includes validating the network,
|
||||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
setting up the label scheme based on the data. This method is typically called
|
setting up the label scheme based on the data. This method is typically called
|
||||||
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
by [`Language.initialize`](/api/language#initialize) and lets you customize
|
||||||
|
@ -197,7 +198,7 @@ This method was previously called `begin_training`.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> textcat = nlp.add_pipe("textcat")
|
> textcat = nlp.add_pipe("textcat")
|
||||||
> textcat.initialize(lambda: [], nlp=nlp)
|
> textcat.initialize(lambda: examples, nlp=nlp)
|
||||||
> ```
|
> ```
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
|
@ -212,7 +213,7 @@ This method was previously called `begin_training`.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
|
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
|
||||||
|
|
|
@ -127,10 +127,10 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
|
||||||
|
|
||||||
Initialize the component for training and return an
|
Initialize the component for training and return an
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
||||||
function that returns an iterable of [`Example`](/api/example) objects. The data
|
function that returns an iterable of [`Example`](/api/example) objects. **At
|
||||||
examples are used to **initialize the model** of the component and can either be
|
least one example should be supplied.** The data examples are used to
|
||||||
the full training data or a representative sample. Initialization includes
|
**initialize the model** of the component and can either be the full training
|
||||||
validating the network,
|
data or a representative sample. Initialization includes validating the network,
|
||||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
setting up the label scheme based on the data. This method is typically called
|
setting up the label scheme based on the data. This method is typically called
|
||||||
by [`Language.initialize`](/api/language#initialize).
|
by [`Language.initialize`](/api/language#initialize).
|
||||||
|
@ -139,14 +139,14 @@ by [`Language.initialize`](/api/language#initialize).
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> tok2vec = nlp.add_pipe("tok2vec")
|
> tok2vec = nlp.add_pipe("tok2vec")
|
||||||
> tok2vec.initialize(lambda: [], nlp=nlp)
|
> tok2vec.initialize(lambda: examples, nlp=nlp)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
|
|
||||||
## Tok2Vec.predict {#predict tag="method"}
|
## Tok2Vec.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -425,8 +425,8 @@ The L2 norm of the token's vector representation.
|
||||||
| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ |
|
| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ |
|
||||||
| `ent_kb_id` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ |
|
| `ent_kb_id` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ |
|
||||||
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ |
|
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ |
|
||||||
| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ |
|
| `ent_id` | ID of the entity the token is an instance of, if any. ~~int~~ |
|
||||||
| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ |
|
| `ent_id_` | ID of the entity the token is an instance of, if any. ~~str~~ |
|
||||||
| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ |
|
| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ |
|
||||||
| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ |
|
| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ |
|
||||||
| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ |
|
| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ |
|
||||||
|
|
|
@ -240,7 +240,7 @@ browser. Will run a simple web server.
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
|
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
|
||||||
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
|
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
|
||||||
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
||||||
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
||||||
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
||||||
|
@ -265,7 +265,7 @@ Render a dependency parse tree or named entity visualization.
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ |
|
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ |
|
||||||
| `style` | Visualization style,`"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
|
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
|
||||||
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
||||||
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
||||||
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
||||||
|
@ -273,6 +273,73 @@ Render a dependency parse tree or named entity visualization.
|
||||||
| `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
|
| `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
|
||||||
| **RETURNS** | The rendered HTML markup. ~~str~~ |
|
| **RETURNS** | The rendered HTML markup. ~~str~~ |
|
||||||
|
|
||||||
|
### displacy.parse_deps {#displacy.parse_deps tag="method" new="2"}
|
||||||
|
|
||||||
|
Generate dependency parse in `{'words': [], 'arcs': []}` format.
|
||||||
|
For use with the `manual=True` argument in `displacy.render`.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> import spacy
|
||||||
|
> from spacy import displacy
|
||||||
|
> nlp = spacy.load("en_core_web_sm")
|
||||||
|
> doc = nlp("This is a sentence.")
|
||||||
|
> deps_parse = displacy.parse_deps(doc)
|
||||||
|
> html = displacy.render(deps_parse, style="dep", manual=True)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------------------------- |
|
||||||
|
| `orig_doc` | Doc to parse dependencies. ~~Doc~~ |
|
||||||
|
| `options` | Dependency parse specific visualisation options. ~~Dict[str, Any]~~ |
|
||||||
|
| **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~ |
|
||||||
|
|
||||||
|
### displacy.parse_ents {#displacy.parse_ents tag="method" new="2"}
|
||||||
|
|
||||||
|
Generate named entities in `[{start: i, end: i, label: 'label'}]` format.
|
||||||
|
For use with the `manual=True` argument in `displacy.render`.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> import spacy
|
||||||
|
> from spacy import displacy
|
||||||
|
> nlp = spacy.load("en_core_web_sm")
|
||||||
|
> doc = nlp("But Google is starting from behind.")
|
||||||
|
> ents_parse = displacy.parse_ents(doc)
|
||||||
|
> html = displacy.render(ents_parse, style="ent", manual=True)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------------------------- |
|
||||||
|
| `doc` | Doc to parse entities. ~~Doc~~ |
|
||||||
|
| `options` | NER-specific visualisation options. ~~Dict[str, Any]~~ |
|
||||||
|
| **RETURNS** | Generated entities keyed by text (original text) and ents. ~~dict~~ |
|
||||||
|
|
||||||
|
### displacy.parse_spans {#displacy.parse_spans tag="method" new="2"}
|
||||||
|
|
||||||
|
Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format.
|
||||||
|
For use with the `manual=True` argument in `displacy.render`.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> import spacy
|
||||||
|
> from spacy import displacy
|
||||||
|
> nlp = spacy.load("en_core_web_sm")
|
||||||
|
> doc = nlp("But Google is starting from behind.")
|
||||||
|
> doc.spans['orgs'] = [doc[1:2]]
|
||||||
|
> ents_parse = displacy.parse_spans(doc, options={"spans_key" : "orgs"})
|
||||||
|
> html = displacy.render(ents_parse, style="span", manual=True)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------------------------- |
|
||||||
|
| `doc` | Doc to parse spans. ~~Doc~~ |
|
||||||
|
| `options` | Span-specific visualisation options. ~~Dict[str, Any]~~ |
|
||||||
|
| **RETURNS** | Generated spans keyed by text (original text) and spans. ~~dict~~ |
|
||||||
|
|
||||||
### Visualizer options {#displacy_options}
|
### Visualizer options {#displacy_options}
|
||||||
|
|
||||||
The `options` argument lets you specify additional settings for each visualizer.
|
The `options` argument lets you specify additional settings for each visualizer.
|
||||||
|
|
|
@ -175,10 +175,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
|
||||||
|
|
||||||
Initialize the component for training and return an
|
Initialize the component for training and return an
|
||||||
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
|
||||||
function that returns an iterable of [`Example`](/api/example) objects. The data
|
function that returns an iterable of [`Example`](/api/example) objects. **At
|
||||||
examples are used to **initialize the model** of the component and can either be
|
least one example should be supplied.** The data examples are used to
|
||||||
the full training data or a representative sample. Initialization includes
|
**initialize the model** of the component and can either be the full training
|
||||||
validating the network,
|
data or a representative sample. Initialization includes validating the network,
|
||||||
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
|
||||||
setting up the label scheme based on the data. This method is typically called
|
setting up the label scheme based on the data. This method is typically called
|
||||||
by [`Language.initialize`](/api/language#initialize).
|
by [`Language.initialize`](/api/language#initialize).
|
||||||
|
@ -187,14 +187,14 @@ by [`Language.initialize`](/api/language#initialize).
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> trf = nlp.add_pipe("transformer")
|
> trf = nlp.add_pipe("transformer")
|
||||||
> trf.initialize(lambda: iter([]), nlp=nlp)
|
> trf.initialize(lambda: examples, nlp=nlp)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||||
|
|
||||||
## Transformer.predict {#predict tag="method"}
|
## Transformer.predict {#predict tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -11,8 +11,8 @@ menu:
|
||||||
- ['Tokenization', 'tokenization']
|
- ['Tokenization', 'tokenization']
|
||||||
- ['Merging & Splitting', 'retokenization']
|
- ['Merging & Splitting', 'retokenization']
|
||||||
- ['Sentence Segmentation', 'sbd']
|
- ['Sentence Segmentation', 'sbd']
|
||||||
- ['Vectors & Similarity', 'vectors-similarity']
|
|
||||||
- ['Mappings & Exceptions', 'mappings-exceptions']
|
- ['Mappings & Exceptions', 'mappings-exceptions']
|
||||||
|
- ['Vectors & Similarity', 'vectors-similarity']
|
||||||
- ['Language Data', 'language-data']
|
- ['Language Data', 'language-data']
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
|
@ -268,18 +268,49 @@ used for training the current [Japanese pipelines](/models/ja).
|
||||||
|
|
||||||
### Korean language support {#korean}
|
### Korean language support {#korean}
|
||||||
|
|
||||||
> #### mecab-ko tokenizer
|
There are currently three built-in options for Korean tokenization, two based on
|
||||||
|
[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) and one
|
||||||
|
using the rule-based tokenizer.
|
||||||
|
|
||||||
|
> #### Default mecab-ko tokenizer
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
|
> # uses mecab-ko-dic
|
||||||
> nlp = spacy.blank("ko")
|
> nlp = spacy.blank("ko")
|
||||||
|
>
|
||||||
|
> # with custom mecab args
|
||||||
|
> mecab_args = "-d /path/to/dicdir -u /path/to/userdic"
|
||||||
|
> config = {"nlp": {"tokenizer": {"mecab_args": mecab_args}}}
|
||||||
|
> nlp = spacy.blank("ko", config=config)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
The default MeCab-based Korean tokenizer requires:
|
The default MeCab-based Korean tokenizer requires the Python package
|
||||||
|
[`mecab-ko`](https://pypi.org/project/mecab-ko/) and no further system
|
||||||
|
requirements.
|
||||||
|
|
||||||
|
The `natto-py` MeCab-based tokenizer (the previous default for spaCy v3.4 and
|
||||||
|
earlier) is available as `spacy.KoreanNattoTokenizer.v1`. It requires:
|
||||||
|
|
||||||
- [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
|
- [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
|
||||||
- [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
|
- [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
|
||||||
- [natto-py](https://github.com/buruzaemon/natto-py)
|
- [natto-py](https://github.com/buruzaemon/natto-py)
|
||||||
|
|
||||||
|
To use this tokenizer, edit `[nlp.tokenizer]` in your config:
|
||||||
|
|
||||||
|
> #### natto-py MeCab-ko tokenizer
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
|
||||||
|
> nlp = spacy.blank("ko", config=config)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### config.cfg
|
||||||
|
[nlp]
|
||||||
|
lang = "ko"
|
||||||
|
tokenizer = {"@tokenizers" = "spacy.KoreanNattoTokenizer.v1"}
|
||||||
|
```
|
||||||
|
|
||||||
For some Korean datasets and tasks, the
|
For some Korean datasets and tasks, the
|
||||||
[rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited
|
[rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited
|
||||||
than MeCab. To configure a Korean pipeline with the rule-based tokenizer:
|
than MeCab. To configure a Korean pipeline with the rule-based tokenizer:
|
||||||
|
|
|
@ -1367,14 +1367,14 @@ patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
|
||||||
ruler.add_patterns(patterns)
|
ruler.add_patterns(patterns)
|
||||||
|
|
||||||
doc1 = nlp("Apple is opening its first big office in San Francisco.")
|
doc1 = nlp("Apple is opening its first big office in San Francisco.")
|
||||||
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
|
print([(ent.text, ent.label_, ent.id_) for ent in doc1.ents])
|
||||||
|
|
||||||
doc2 = nlp("Apple is opening its first big office in San Fran.")
|
doc2 = nlp("Apple is opening its first big office in San Fran.")
|
||||||
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
|
print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
|
||||||
```
|
```
|
||||||
|
|
||||||
If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
|
If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
|
||||||
patterns, the `ent_id_` property of the matched entity is set to the `id` given
|
patterns, the `id_` property of the matched entity is set to the `id` given
|
||||||
in the patterns. So in the example above it's easy to identify that "San
|
in the patterns. So in the example above it's easy to identify that "San
|
||||||
Francisco" and "San Fran" are both the same entity.
|
Francisco" and "San Fran" are both the same entity.
|
||||||
|
|
||||||
|
|
|
@ -195,7 +195,7 @@ the data to and from a JSON file.
|
||||||
>
|
>
|
||||||
> To see custom serialization methods in action, check out the new
|
> To see custom serialization methods in action, check out the new
|
||||||
> [`EntityRuler`](/api/entityruler) component and its
|
> [`EntityRuler`](/api/entityruler) component and its
|
||||||
> [source](%%GITHUB_SPACY/spacy/pipeline/entityruler.py). Patterns added to the
|
> [source](%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py). Patterns added to the
|
||||||
> component will be saved to a `.jsonl` file if the pipeline is serialized to
|
> component will be saved to a `.jsonl` file if the pipeline is serialized to
|
||||||
> disk, and to a bytestring if the pipeline is serialized to bytes. This allows
|
> disk, and to a bytestring if the pipeline is serialized to bytes. This allows
|
||||||
> saving out a pipeline with a rule-based entity recognizer and including all
|
> saving out a pipeline with a rule-based entity recognizer and including all
|
||||||
|
|
|
@ -198,12 +198,12 @@ import DisplacySpanHtml from 'images/displacy-span.html'
|
||||||
|
|
||||||
The span visualizer lets you customize the following `options`:
|
The span visualizer lets you customize the following `options`:
|
||||||
|
|
||||||
| Argument | Description |
|
| Argument | Description |
|
||||||
|-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|
|
| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `spans_key` | Which spans key to render spans from. Default is `"sc"`. ~~str~~ |
|
| `spans_key` | Which spans key to render spans from. Default is `"sc"`. ~~str~~ |
|
||||||
| `templates` | Dictionary containing the keys `"span"`, `"slice"`, and `"start"`. These dictate how the overall span, a span slice, and the starting token will be rendered. ~~Optional[Dict[str, str]~~ |
|
| `templates` | Dictionary containing the keys `"span"`, `"slice"`, and `"start"`. These dictate how the overall span, a span slice, and the starting token will be rendered. ~~Optional[Dict[str, str]]~~ |
|
||||||
| `kb_url_template` | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in ~~Optional[str]~~ |
|
| `kb_url_template` | Optional template to construct the KB URL for the entity to link to. Expects a Python f-string format with a single field to fill in. ~~Optional[str]~~ |
|
||||||
| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ |
|
| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ |
|
||||||
|
|
||||||
Because spans can be stored across different keys in `doc.spans`, you need to specify
|
Because spans can be stored across different keys in `doc.spans`, you need to specify
|
||||||
which one displaCy should use with `spans_key` (`sc` is the default).
|
which one displaCy should use with `spans_key` (`sc` is the default).
|
||||||
|
@ -343,9 +343,21 @@ want to visualize output from other libraries, like [NLTK](http://www.nltk.org)
|
||||||
or
|
or
|
||||||
[SyntaxNet](https://github.com/tensorflow/models/tree/master/research/syntaxnet).
|
[SyntaxNet](https://github.com/tensorflow/models/tree/master/research/syntaxnet).
|
||||||
If you set `manual=True` on either `render()` or `serve()`, you can pass in data
|
If you set `manual=True` on either `render()` or `serve()`, you can pass in data
|
||||||
in displaCy's format as a dictionary (instead of `Doc` objects).
|
in displaCy's format as a dictionary (instead of `Doc` objects). There are helper
|
||||||
|
functions for converting `Doc` objects to displaCy's format for use with `manual=True`:
|
||||||
|
[`displacy.parse_deps`](/api/top-level#displacy.parse_deps),
|
||||||
|
[`displacy.parse_ents`](/api/top-level#displacy.parse_ents),
|
||||||
|
and [`displacy.parse_spans`](/api/top-level#displacy.parse_spans).
|
||||||
|
|
||||||
> #### Example
|
> #### Example with parse function
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("But Google is starting from behind.")
|
||||||
|
> ex = displacy.parse_ents(doc)
|
||||||
|
> html = displacy.render(ex, style="ent", manual=True)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
> #### Example with raw data
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> ex = [{"text": "But Google is starting from behind.",
|
> ex = [{"text": "But Google is starting from behind.",
|
||||||
|
@ -354,6 +366,7 @@ in displaCy's format as a dictionary (instead of `Doc` objects).
|
||||||
> html = displacy.render(ex, style="ent", manual=True)
|
> html = displacy.render(ex, style="ent", manual=True)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### DEP input
|
### DEP input
|
||||||
{
|
{
|
||||||
|
@ -389,6 +402,18 @@ in displaCy's format as a dictionary (instead of `Doc` objects).
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
### SPANS input
|
||||||
|
{
|
||||||
|
"text": "Welcome to the Bank of China.",
|
||||||
|
"spans": [
|
||||||
|
{"start_token": 3, "end_token": 6, "label": "ORG"},
|
||||||
|
{"start_token": 5, "end_token": 6, "label": "GPE"},
|
||||||
|
],
|
||||||
|
"tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## Using displaCy in a web application {#webapp}
|
## Using displaCy in a web application {#webapp}
|
||||||
|
|
||||||
If you want to use the visualizers as part of a web application, for example to
|
If you want to use the visualizers as part of a web application, for example to
|
||||||
|
|
|
@ -265,6 +265,11 @@
|
||||||
"name": "Luxembourgish",
|
"name": "Luxembourgish",
|
||||||
"has_examples": true
|
"has_examples": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"code": "lg",
|
||||||
|
"name": "Luganda",
|
||||||
|
"has_examples": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"code": "lij",
|
"code": "lij",
|
||||||
"name": "Ligurian",
|
"name": "Ligurian",
|
||||||
|
@ -369,8 +374,8 @@
|
||||||
"has_examples": true,
|
"has_examples": true,
|
||||||
"dependencies": [
|
"dependencies": [
|
||||||
{
|
{
|
||||||
"name": "pymorphy2",
|
"name": "pymorphy3",
|
||||||
"url": "https://github.com/kmike/pymorphy2"
|
"url": "https://github.com/no-plagiarism/pymorphy3"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"models": [
|
"models": [
|
||||||
|
@ -467,10 +472,20 @@
|
||||||
"code": "uk",
|
"code": "uk",
|
||||||
"name": "Ukrainian",
|
"name": "Ukrainian",
|
||||||
"has_examples": true,
|
"has_examples": true,
|
||||||
|
"models": [
|
||||||
|
"uk_core_news_sm",
|
||||||
|
"uk_core_news_md",
|
||||||
|
"uk_core_news_lg",
|
||||||
|
"uk_core_news_trf"
|
||||||
|
],
|
||||||
"dependencies": [
|
"dependencies": [
|
||||||
{
|
{
|
||||||
"name": "pymorphy2",
|
"name": "pymorphy3",
|
||||||
"url": "https://github.com/kmike/pymorphy2"
|
"url": "https://github.com/no-plagiarism/pymorphy3"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "pymorphy3-dicts-uk",
|
||||||
|
"url": "https://github.com/no-plagiarism/pymorphy3-dicts"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
|
@ -1,5 +1,39 @@
|
||||||
{
|
{
|
||||||
"resources": [
|
"resources": [
|
||||||
|
{
|
||||||
|
"id": "concepcy",
|
||||||
|
"title": "concepCy",
|
||||||
|
"slogan": "A multilingual knowledge graph in spaCy",
|
||||||
|
"description": "A spaCy wrapper for ConceptNet, a freely-available semantic network designed to help computers understand the meaning of words.",
|
||||||
|
"github": "JulesBelveze/concepcy",
|
||||||
|
"pip": "concepcy",
|
||||||
|
"code_example": [
|
||||||
|
"import spacy",
|
||||||
|
"import concepcy",
|
||||||
|
"",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')",
|
||||||
|
"# Using default concepCy configuration",
|
||||||
|
"nlp.add_pipe('concepcy')",
|
||||||
|
"",
|
||||||
|
"doc = nlp('WHO is a lovely company')",
|
||||||
|
"",
|
||||||
|
"# Access all the 'RelatedTo' relations from the Doc",
|
||||||
|
"for word, relations in doc._.relatedto.items():",
|
||||||
|
" print(f'Word: {word}\n{relations}')",
|
||||||
|
"",
|
||||||
|
"# Access the 'RelatedTo' relations word by word",
|
||||||
|
"for token in doc:",
|
||||||
|
" print(f'Word: {token}\n{token._.relatedto}')"
|
||||||
|
],
|
||||||
|
"category": ["pipeline"],
|
||||||
|
"image": "https://github.com/JulesBelveze/concepcy/blob/main/figures/concepcy.png",
|
||||||
|
"tags": ["semantic", "ConceptNet"],
|
||||||
|
"author": "Jules Belveze",
|
||||||
|
"author_links": {
|
||||||
|
"github": "JulesBelveze",
|
||||||
|
"website": "https://www.linkedin.com/in/jules-belveze/"
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"id": "spacyfishing",
|
"id": "spacyfishing",
|
||||||
"title": "spaCy fishing",
|
"title": "spaCy fishing",
|
||||||
|
|
|
@ -114,7 +114,11 @@ function formatVectors(data) {
|
||||||
if (!data) return 'n/a'
|
if (!data) return 'n/a'
|
||||||
if (Object.values(data).every(n => n === 0)) return 'context vectors only'
|
if (Object.values(data).every(n => n === 0)) return 'context vectors only'
|
||||||
const { keys, vectors, width } = data
|
const { keys, vectors, width } = data
|
||||||
return `${abbrNum(keys)} keys, ${abbrNum(vectors)} unique vectors (${width} dimensions)`
|
if (keys >= 0) {
|
||||||
|
return `${abbrNum(keys)} keys, ${abbrNum(vectors)} unique vectors (${width} dimensions)`
|
||||||
|
} else {
|
||||||
|
return `${abbrNum(vectors)} floret vectors (${width} dimensions)`
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function formatAccuracy(data, lang) {
|
function formatAccuracy(data, lang) {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user