Merge remote-tracking branch 'upstream/v4' into store-activations

This commit is contained in:
Daniël de Kok 2022-08-30 10:11:04 +02:00
commit 3937abd2e7
91 changed files with 1805 additions and 689 deletions

View File

@ -54,12 +54,12 @@ steps:
condition: eq(${{ parameters.gpu }}, true)
- script: |
${{ parameters.prefix }} python -m pytest --pyargs spacy
${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
displayName: "Run CPU tests"
condition: eq(${{ parameters.gpu }}, false)
- script: |
${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
displayName: "Run GPU tests"
condition: eq(${{ parameters.gpu }}, true)

View File

@ -1,13 +0,0 @@
# Configuration for probot-no-response - https://github.com/probot/no-response
# Number of days of inactivity before an Issue is closed for lack of response
daysUntilClose: 14
# Label requiring a response
responseRequiredLabel: more-info-needed
# Comment to post when closing an Issue for lack of response. Set to `false` to disable
closeComment: >
This issue has been automatically closed because there has been no response
to a request for more information from the original author. With only the
information that is currently in the issue, there's not enough information
to take action. If you're the original author, feel free to reopen the issue
if you have or find the answers needed to investigate further.

View File

@ -15,7 +15,7 @@ jobs:
issue-manager:
runs-on: ubuntu-latest
steps:
- uses: tiangolo/issue-manager@0.2.1
- uses: tiangolo/issue-manager@0.4.0
with:
token: ${{ secrets.GITHUB_TOKEN }}
config: >
@ -25,5 +25,11 @@ jobs:
"message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.",
"remove_label_on_comment": true,
"remove_label_on_close": true
},
"more-info-needed": {
"delay": "P7D",
"message": "This issue has been automatically closed because there has been no response to a request for more information from the original author. With only the information that is currently in the issue, there's not enough information to take action. If you're the original author, feel free to reopen the issue if you have or find the answers needed to investigate further.",
"remove_label_on_comment": true,
"remove_label_on_close": true
}
}

View File

@ -32,7 +32,7 @@ jobs:
versionSpec: "3.7"
- script: |
pip install flake8==3.9.2
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
displayName: "flake8"
- job: "Test"

View File

@ -191,6 +191,8 @@ def load_model(name: str) -> "Language":
...
```
Note that we typically put the `from typing` import statements on the first line(s) of the Python module.
## Structuring logic
### Positional and keyword arguments
@ -275,6 +277,27 @@ If you have to use `try`/`except`, make sure to only include what's **absolutely
+ return [v.strip() for v in value.split(",")]
```
### Numeric comparisons
For numeric comparisons, as a general rule we always use `<` and `>=` and avoid the usage of `<=` and `>`. This is to ensure we consistently
apply inclusive lower bounds and exclusive upper bounds, helping to prevent off-by-one errors.
One exception to this rule is the ternary case. With a chain like
```python
if value >= 0 and value < max:
...
```
it's fine to rewrite this to the shorter form
```python
if 0 <= value < max:
...
```
even though this requires the usage of the `<=` operator.
### Iteration and comprehensions
We generally avoid using built-in functions like `filter` or `map` in favor of list or generator comprehensions.
@ -451,7 +474,7 @@ spaCy uses the [`pytest`](http://doc.pytest.org/) framework for testing. Tests f
When adding tests, make sure to use descriptive names and only test for one behavior at a time. Tests should be grouped into modules dedicated to the same type of functionality and some test modules are organized as directories of test files related to the same larger area of the library, e.g. `matcher` or `tokenizer`.
Regression tests are tests that refer to bugs reported in specific issues. They should live in the relevant module of the test suite, named according to the issue number (e.g., `test_issue1234.py`), and [marked](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers) appropriately (e.g. `@pytest.mark.issue(1234)`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression tests suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first.
Regression tests are tests that refer to bugs reported in specific issues. They should live in the relevant module of the test suite, named according to the issue number (e.g., `test_issue1234.py`), and [marked](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers) appropriately (e.g. `@pytest.mark.issue(1234)`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression tests suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first.
The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file.

View File

@ -31,14 +31,6 @@ project_urls =
zip_safe = false
include_package_data = true
python_requires = >=3.6
setup_requires =
cython>=0.25,<3.0
numpy>=1.15.0
# We also need our Cython packages here to compile against
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.1.0,<8.2.0
install_requires =
# Our libraries
spacy-legacy>=3.0.9,<3.1.0
@ -114,7 +106,7 @@ ja =
sudachipy>=0.5.2,!=0.6.1
sudachidict_core>=20211220
ko =
natto-py>=0.9.0
mecab-ko>=1.0.0
th =
pythainlp>=2.0

View File

@ -97,7 +97,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
locals().update(IDS)
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
def intify_attrs(stringy_attrs, strings_map=None):
"""
Normalize a dictionary of attributes, converting them to ints.
@ -109,75 +109,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
converted to ints.
"""
inty_attrs = {}
if _do_deprecated:
if "F" in stringy_attrs:
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
if "L" in stringy_attrs:
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
if "pos" in stringy_attrs:
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
if "morph" in stringy_attrs:
morphs = stringy_attrs.pop("morph")
if "number" in stringy_attrs:
stringy_attrs.pop("number")
if "tenspect" in stringy_attrs:
stringy_attrs.pop("tenspect")
morph_keys = [
"PunctType",
"PunctSide",
"Other",
"Degree",
"AdvType",
"Number",
"VerbForm",
"PronType",
"Aspect",
"Tense",
"PartType",
"Poss",
"Hyph",
"ConjType",
"NumType",
"Foreign",
"VerbType",
"NounType",
"Gender",
"Mood",
"Negative",
"Tense",
"Voice",
"Abbr",
"Derivation",
"Echo",
"Foreign",
"NameType",
"NounType",
"NumForm",
"NumValue",
"PartType",
"Polite",
"StyleVariant",
"PronType",
"AdjType",
"Person",
"Variant",
"AdpType",
"Reflex",
"Negative",
"Mood",
"Aspect",
"Case",
"Polarity",
"PrepCase",
"Animacy", # U20
]
for key in morph_keys:
if key in stringy_attrs:
stringy_attrs.pop(key)
elif key.lower() in stringy_attrs:
stringy_attrs.pop(key.lower())
elif key.upper() in stringy_attrs:
stringy_attrs.pop(key.upper())
for name, value in stringy_attrs.items():
int_key = intify_attr(name)
if int_key is not None:

View File

@ -7,6 +7,7 @@ import typer
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about
from ..util import is_package, get_minor_version, run_command
from ..util import is_prerelease_version
from ..errors import OLD_MODEL_SHORTCUTS
@ -74,7 +75,10 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
def get_compatibility() -> dict:
version = get_minor_version(about.__version__)
if is_prerelease_version(about.__version__):
version: Optional[str] = about.__version__
else:
version = get_minor_version(about.__version__)
r = requests.get(about.__compatibility__)
if r.status_code != 200:
msg.fail(

View File

@ -123,7 +123,8 @@ def app(environ, start_response):
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
"""Generate dependency parse in {'words': [], 'arcs': []} format.
doc (Doc): Document do parse.
orig_doc (Doc): Document to parse.
options (Dict[str, Any]): Dependency parse specific visualisation options.
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
doc = Doc(orig_doc.vocab).from_bytes(
@ -209,7 +210,7 @@ def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
"""Generate spans in [{start: i, end: i, label: 'label'}] format.
"""Generate spans in [{start_token: i, end_token: i, label: 'label'}] format.
doc (Doc): Document to parse.
options (Dict[str, any]): Span-specific visualisation options.

View File

@ -16,8 +16,8 @@ def setup_default_warnings():
filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa
filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa
# warn about entity_ruler & matcher having no patterns only once
for pipe in ["matcher", "entity_ruler"]:
# warn about entity_ruler, span_ruler & matcher having no patterns only once
for pipe in ["matcher", "entity_ruler", "span_ruler"]:
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
# warn once about lemmatizer without required POS
@ -390,7 +390,7 @@ class Errors(metaclass=ErrorsWithCodes):
"consider using doc.spans instead.")
E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
"settings: {opts}")
E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}")
E107 = ("Value of custom attribute `{attr}` is not JSON-serializable: {value}")
E109 = ("Component '{name}' could not be run. Did you forget to "
"call `initialize()`?")
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
@ -536,11 +536,12 @@ class Errors(metaclass=ErrorsWithCodes):
E198 = ("Unable to return {n} most similar vectors for the current vectors "
"table, which contains {n_rows} vectors.")
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
E200 = ("Can't set {attr} from Span.")
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
# New errors added in v3.x
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
"not permitted in factory names.")
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
"permit overlapping spans.")
E855 = ("Invalid {obj}: {obj} is not from the same doc.")

View File

@ -18,34 +18,23 @@ DEFAULT_CONFIG = """
[nlp.tokenizer]
@tokenizers = "spacy.ko.KoreanTokenizer"
mecab_args = ""
"""
@registry.tokenizers("spacy.ko.KoreanTokenizer")
def create_tokenizer():
def create_tokenizer(mecab_args: str):
def korean_tokenizer_factory(nlp):
return KoreanTokenizer(nlp.vocab)
return KoreanTokenizer(nlp.vocab, mecab_args=mecab_args)
return korean_tokenizer_factory
class KoreanTokenizer(DummyTokenizer):
def __init__(self, vocab: Vocab):
def __init__(self, vocab: Vocab, *, mecab_args: str = ""):
self.vocab = vocab
self._mecab = try_mecab_import() # type: ignore[func-returns-value]
self._mecab_tokenizer = None
@property
def mecab_tokenizer(self):
# This is a property so that initializing a pipeline with blank:ko is
# possible without actually requiring mecab-ko, e.g. to run
# `spacy init vectors ko` for a pipeline that will have a different
# tokenizer in the end. The languages need to match for the vectors
# to be imported and there's no way to pass a custom config to
# `init vectors`.
if self._mecab_tokenizer is None:
self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
return self._mecab_tokenizer
mecab = try_mecab_import()
self.mecab_tokenizer = mecab.Tagger(mecab_args)
def __reduce__(self):
return KoreanTokenizer, (self.vocab,)
@ -68,13 +57,15 @@ class KoreanTokenizer(DummyTokenizer):
def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
# 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
# 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
for node in self.mecab_tokenizer.parse(text, as_nodes=True):
if node.is_eos():
for line in self.mecab_tokenizer.parse(text).split("\n"):
if line == "EOS":
break
surface = node.surface
feature = node.feature
tag, _, expr = feature.partition(",")
lemma, _, remainder = expr.partition("/")
surface, _, expr = line.partition("\t")
features = expr.split("/")[0].split(",")
tag = features[0]
lemma = "*"
if len(features) >= 8:
lemma = features[7]
if lemma == "*":
lemma = surface
yield {"surface": surface, "lemma": lemma, "tag": tag}
@ -97,20 +88,94 @@ class Korean(Language):
Defaults = KoreanDefaults
def try_mecab_import() -> None:
def try_mecab_import():
try:
from natto import MeCab
import mecab_ko as MeCab
return MeCab
except ImportError:
raise ImportError(
'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
"[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
"and [natto-py](https://github.com/buruzaemon/natto-py)"
"the python package `mecab-ko`: pip install mecab-ko"
) from None
@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
def create_natto_tokenizer():
def korean_natto_tokenizer_factory(nlp):
return KoreanNattoTokenizer(nlp.vocab)
return korean_natto_tokenizer_factory
class KoreanNattoTokenizer(DummyTokenizer):
def __init__(self, vocab: Vocab):
self.vocab = vocab
self._mecab = self._try_mecab_import() # type: ignore[func-returns-value]
self._mecab_tokenizer = None
@property
def mecab_tokenizer(self):
# This is a property so that initializing a pipeline with blank:ko is
# possible without actually requiring mecab-ko, e.g. to run
# `spacy init vectors ko` for a pipeline that will have a different
# tokenizer in the end. The languages need to match for the vectors
# to be imported and there's no way to pass a custom config to
# `init vectors`.
if self._mecab_tokenizer is None:
self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
return self._mecab_tokenizer
def __reduce__(self):
return KoreanNattoTokenizer, (self.vocab,)
def __call__(self, text: str) -> Doc:
dtokens = list(self.detailed_tokens(text))
surfaces = [dt["surface"] for dt in dtokens]
doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
for token, dtoken in zip(doc, dtokens):
first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미)
if token.tag_ in TAG_MAP:
token.pos = TAG_MAP[token.tag_][POS]
else:
token.pos = X
token.lemma_ = dtoken["lemma"]
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
return doc
def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
# 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
# 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
for node in self.mecab_tokenizer.parse(text, as_nodes=True):
if node.is_eos():
break
surface = node.surface
feature = node.feature
tag, _, expr = feature.partition(",")
lemma, _, remainder = expr.partition("/")
if lemma == "*" or lemma == "":
lemma = surface
yield {"surface": surface, "lemma": lemma, "tag": tag}
def score(self, examples):
validate_examples(examples, "KoreanTokenizer.score")
return Scorer.score_tokenization(examples)
def _try_mecab_import(self):
try:
from natto import MeCab
return MeCab
except ImportError:
raise ImportError(
'The Korean Natto tokenizer ("spacy.ko.KoreanNattoTokenizer") requires '
"[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
"and [natto-py](https://github.com/buruzaemon/natto-py)"
) from None
def check_spaces(text, tokens):
prev_end = -1
start = 0

View File

@ -3,7 +3,7 @@ from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
_infixes = (
["·", "", "\(", "\)"]
["·", "", r"\(", r"\)"]
+ [r"(?<=[0-9])~(?=[0-9-])"]
+ LIST_QUOTES
+ BASE_TOKENIZER_INFIXES

18
spacy/lang/lg/__init__.py Normal file
View File

@ -0,0 +1,18 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from ...language import Language, BaseDefaults
class LugandaDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
class Luganda(Language):
lang = "lg"
Defaults = LugandaDefaults
__all__ = ["Luganda"]

17
spacy/lang/lg/examples.py Normal file
View File

@ -0,0 +1,17 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.lg.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Mpa ebyafaayo ku byalo Nakatu ne Nkajja",
"Okuyita Ttembo kitegeeza kugwa ddalu",
"Ekifumu kino kyali kya mulimu ki?",
"Ekkovu we liyise wayitibwa mukululo",
"Akola mulimu ki oguvaamu ssente?",
"Emisumaali egikomerera embaawo giyitibwa nninga",
"Abooluganda abemmamba ababiri",
"Ekisaawe ky'ebyenjigiriza kya mugaso nnyo",
]

View File

@ -0,0 +1,95 @@
from ...attrs import LIKE_NUM
_num_words = [
"nnooti", # Zero
"zeero", # zero
"emu", # one
"bbiri", # two
"ssatu", # three
"nnya", # four
"ttaano", # five
"mukaaga", # six
"musanvu", # seven
"munaana", # eight
"mwenda", # nine
"kkumi", # ten
"kkumi n'emu", # eleven
"kkumi na bbiri", # twelve
"kkumi na ssatu", # thirteen
"kkumi na nnya", # forteen
"kkumi na ttaano", # fifteen
"kkumi na mukaaga", # sixteen
"kkumi na musanvu", # seventeen
"kkumi na munaana", # eighteen
"kkumi na mwenda", # nineteen
"amakumi abiri", # twenty
"amakumi asatu", # thirty
"amakumi ana", # forty
"amakumi ataano", # fifty
"nkaaga", # sixty
"nsanvu", # seventy
"kinaana", # eighty
"kyenda", # ninety
"kikumi", # hundred
"lukumi", # thousand
"kakadde", # million
"kawumbi", # billion
"kase", # trillion
"katabalika", # quadrillion
"keesedde", # gajillion
"kafukunya", # bazillion
"ekisooka", # first
"ekyokubiri", # second
"ekyokusatu", # third
"ekyokuna", # fourth
"ekyokutaano", # fifith
"ekyomukaaga", # sixth
"ekyomusanvu", # seventh
"eky'omunaana", # eighth
"ekyomwenda", # nineth
"ekyekkumi", # tenth
"ekyekkumi n'ekimu", # eleventh
"ekyekkumi n'ebibiri", # twelveth
"ekyekkumi n'ebisatu", # thirteenth
"ekyekkumi n'ebina", # fourteenth
"ekyekkumi n'ebitaano", # fifteenth
"ekyekkumi n'omukaaga", # sixteenth
"ekyekkumi n'omusanvu", # seventeenth
"ekyekkumi n'omunaana", # eigteenth
"ekyekkumi n'omwenda", # nineteenth
"ekyamakumi abiri", # twentieth
"ekyamakumi asatu", # thirtieth
"ekyamakumi ana", # fortieth
"ekyamakumi ataano", # fiftieth
"ekyenkaaga", # sixtieth
"ekyensanvu", # seventieth
"ekyekinaana", # eightieth
"ekyekyenda", # ninetieth
"ekyekikumi", # hundredth
"ekyolukumi", # thousandth
"ekyakakadde", # millionth
"ekyakawumbi", # billionth
"ekyakase", # trillionth
"ekyakatabalika", # quadrillionth
"ekyakeesedde", # gajillionth
"ekyakafukunya", # bazillionth
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
if text_lower in _num_words:
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}

View File

@ -0,0 +1,19 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)
TOKENIZER_INFIXES = _infixes

View File

@ -0,0 +1,19 @@
STOP_WORDS = set(
"""
abadde abalala abamu abangi abava ajja ali alina ani anti ateekeddwa atewamu
atya awamu aweebwa ayinza ba baali babadde babalina bajja
bajjanewankubade bali balina bandi bangi bano bateekeddwa baweebwa bayina bebombi beera bibye
bimu bingi bino bo bokka bonna buli bulijjo bulungi bwabwe bwaffe bwayo bwe bwonna bya byabwe
byaffe byebimu byonna ddaa ddala ddi e ebimu ebiri ebweruobulungi ebyo edda ejja ekirala ekyo
endala engeri ennyo era erimu erina ffe ffenna ga gujja gumu gunno guno gwa gwe kaseera kati
kennyini ki kiki kikino kikye kikyo kino kirungi kki ku kubangabyombi kubangaolwokuba kudda
kuva kuwa kwegamba kyaffe kye kyekimuoyo kyekyo kyonna leero liryo lwa lwaki lyabwezaabwe
lyaffe lyange mbadde mingi mpozzi mu mulinaoyina munda mwegyabwe nolwekyo nabadde nabo nandiyagadde
nandiye nanti naye ne nedda neera nga nnyingi nnyini nnyinza nnyo nti nyinza nze oba ojja okudda
okugenda okuggyako okutuusa okuva okuwa oli olina oluvannyuma olwekyobuva omuli ono osobola otya
oyina oyo seetaaga si sinakindi singa talina tayina tebaali tebaalina tebayina terina tetulina
tetuteekeddwa tewali teyalina teyayina tolina tu tuyina tulina tuyina twafuna twetaaga wa wabula
wabweru wadde waggulunnina wakati waliwobangi waliyo wandi wange wano wansi weebwa yabadde yaffe
ye yenna yennyini yina yonna ziba zijja zonna
""".split()
)

View File

@ -40,6 +40,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
span_label = doc.vocab.strings.add("NP")
# Only NOUNS and PRONOUNS matter
end_span = -1
for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
# For NOUNS
# Pick children from syntactic parse (only those with certain dependencies)
@ -58,15 +59,17 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
children_i = [c.i for c in children] + [word.i]
start_span = min(children_i)
end_span = max(children_i) + 1
yield start_span, end_span, span_label
if start_span >= end_span:
end_span = max(children_i) + 1
yield start_span, end_span, span_label
# PRONOUNS only if it is the subject of a verb
elif word.pos == PRON:
if word.dep in pronoun_deps:
start_span = word.i
end_span = word.i + 1
yield start_span, end_span, span_label
if start_span >= end_span:
end_span = word.i + 1
yield start_span, end_span, span_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

View File

@ -28,7 +28,7 @@ class Russian(Language):
assigns=["token.lemma"],
default_config={
"model": None,
"mode": "pymorphy2",
"mode": "pymorphy3",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},

View File

@ -19,7 +19,7 @@ class RussianLemmatizer(Lemmatizer):
model: Optional[Model],
name: str = "lemmatizer",
*,
mode: str = "pymorphy2",
mode: str = "pymorphy3",
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
@ -33,6 +33,16 @@ class RussianLemmatizer(Lemmatizer):
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer()
elif mode == "pymorphy3":
try:
from pymorphy3 import MorphAnalyzer
except ImportError:
raise ImportError(
"The Russian lemmatizer mode 'pymorphy3' requires the "
"pymorphy3 library. Install it with: pip install pymorphy3"
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer()
super().__init__(
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
@ -104,6 +114,9 @@ class RussianLemmatizer(Lemmatizer):
return [analyses[0].normal_form]
return [string]
def pymorphy3_lemmatize(self, token: Token) -> List[str]:
return self.pymorphy2_lemmatize(token)
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
gram_map = {

View File

@ -1,9 +1,17 @@
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language, BaseDefaults
class SlovenianDefaults(BaseDefaults):
stop_words = STOP_WORDS
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
class Slovenian(Language):

145
spacy/lang/sl/lex_attrs.py Normal file
View File

@ -0,0 +1,145 @@
from ...attrs import LIKE_NUM
from ...attrs import IS_CURRENCY
import unicodedata
_num_words = set(
"""
nula ničla nič ena dva tri štiri pet šest sedem osem
devet deset enajst dvanajst trinajst štirinajst petnajst
šestnajst sedemnajst osemnajst devetnajst dvajset trideset štirideset
petdeset šestdest sedemdeset osemdeset devedeset sto tisoč
milijon bilijon trilijon kvadrilijon nešteto
en eden enega enemu ennem enim enih enima enimi ene eni eno
dveh dvema dvem dvoje trije treh trem tremi troje štirje štirih štirim štirimi
petih petim petimi šestih šestim šestimi sedmih sedmim sedmimi osmih osmim osmimi
devetih devetim devetimi desetih desetim desetimi enajstih enajstim enajstimi
dvanajstih dvanajstim dvanajstimi trinajstih trinajstim trinajstimi
šestnajstih šestnajstim šestnajstimi petnajstih petnajstim petnajstimi
sedemnajstih sedemnajstim sedemnajstimi osemnajstih osemnajstim osemnajstimi
devetnajstih devetnajstim devetnajstimi dvajsetih dvajsetim dvajsetimi
""".split()
)
_ordinal_words = set(
"""
prvi drugi tretji četrti peti šesti sedmi osmi
deveti deseti enajsti dvanajsti trinajsti štirinajsti
petnajsti šestnajsti sedemnajsti osemnajsti devetnajsti
dvajseti trideseti štirideseti petdeseti šestdeseti sedemdeseti
osemdeseti devetdeseti stoti tisoči milijonti bilijonti
trilijonti kvadrilijonti nešteti
prva druga tretja četrta peta šesta sedma osma
deveta deseta enajsta dvanajsta trinajsta štirnajsta
petnajsta šestnajsta sedemnajsta osemnajsta devetnajsta
dvajseta trideseta štirideseta petdeseta šestdeseta sedemdeseta
osemdeseta devetdeseta stota tisoča milijonta bilijonta
trilijonta kvadrilijonta nešteta
prvo drugo tretje četrto peto šestro sedmo osmo
deveto deseto enajsto dvanajsto trinajsto štirnajsto
petnajsto šestnajsto sedemnajsto osemnajsto devetnajsto
dvajseto trideseto štirideseto petdeseto šestdeseto sedemdeseto
osemdeseto devetdeseto stoto tisočo milijonto bilijonto
trilijonto kvadrilijonto nešteto
prvega drugega tretjega četrtega petega šestega sedmega osmega
devega desetega enajstega dvanajstega trinajstega štirnajstega
petnajstega šestnajstega sedemnajstega osemnajstega devetnajstega
dvajsetega tridesetega štiridesetega petdesetega šestdesetega sedemdesetega
osemdesetega devetdesetega stotega tisočega milijontega bilijontega
trilijontega kvadrilijontega neštetega
prvemu drugemu tretjemu četrtemu petemu šestemu sedmemu osmemu devetemu desetemu
enajstemu dvanajstemu trinajstemu štirnajstemu petnajstemu šestnajstemu sedemnajstemu
osemnajstemu devetnajstemu dvajsetemu tridesetemu štiridesetemu petdesetemu šestdesetemu
sedemdesetemu osemdesetemu devetdesetemu stotemu tisočemu milijontemu bilijontemu
trilijontemu kvadrilijontemu neštetemu
prvem drugem tretjem četrtem petem šestem sedmem osmem devetem desetem
enajstem dvanajstem trinajstem štirnajstem petnajstem šestnajstem sedemnajstem
osemnajstem devetnajstem dvajsetem tridesetem štiridesetem petdesetem šestdesetem
sedemdesetem osemdesetem devetdesetem stotem tisočem milijontem bilijontem
trilijontem kvadrilijontem neštetem
prvim drugim tretjim četrtim petim šestim sedtim osmim devetim desetim
enajstim dvanajstim trinajstim štirnajstim petnajstim šestnajstim sedemnajstim
osemnajstim devetnajstim dvajsetim tridesetim štiridesetim petdesetim šestdesetim
sedemdesetim osemdesetim devetdesetim stotim tisočim milijontim bilijontim
trilijontim kvadrilijontim neštetim
prvih drugih tretjih četrthih petih šestih sedmih osmih deveth desetih
enajstih dvanajstih trinajstih štirnajstih petnajstih šestnajstih sedemnajstih
osemnajstih devetnajstih dvajsetih tridesetih štiridesetih petdesetih šestdesetih
sedemdesetih osemdesetih devetdesetih stotih tisočih milijontih bilijontih
trilijontih kvadrilijontih nešteth
prvima drugima tretjima četrtima petima šestima sedmima osmima devetima desetima
enajstima dvanajstima trinajstima štirnajstima petnajstima šestnajstima sedemnajstima
osemnajstima devetnajstima dvajsetima tridesetima štiridesetima petdesetima šestdesetima
sedemdesetima osemdesetima devetdesetima stotima tisočima milijontima bilijontima
trilijontima kvadrilijontima neštetima
prve druge četrte pete šeste sedme osme devete desete
enajste dvanajste trinajste štirnajste petnajste šestnajste sedemnajste
osemnajste devetnajste dvajsete tridesete štiridesete petdesete šestdesete
sedemdesete osemdesete devetdesete stote tisoče milijonte bilijonte
trilijonte kvadrilijonte neštete
prvimi drugimi tretjimi četrtimi petimi šestimi sedtimi osmimi devetimi desetimi
enajstimi dvanajstimi trinajstimi štirnajstimi petnajstimi šestnajstimi sedemnajstimi
osemnajstimi devetnajstimi dvajsetimi tridesetimi štiridesetimi petdesetimi šestdesetimi
sedemdesetimi osemdesetimi devetdesetimi stotimi tisočimi milijontimi bilijontimi
trilijontimi kvadrilijontimi neštetimi
""".split()
)
_currency_words = set(
"""
evro evra evru evrom evrov evroma evrih evrom evre evri evr eur
cent centa centu cenom centov centoma centih centom cente centi
dolar dolarja dolarji dolarju dolarjem dolarjev dolarjema dolarjih dolarje usd
tolar tolarja tolarji tolarju tolarjem tolarjev tolarjema tolarjih tolarje tol
dinar dinarja dinarji dinarju dinarjem dinarjev dinarjema dinarjih dinarje din
funt funta funti funtu funtom funtov funtoma funtih funte gpb
forint forinta forinti forintu forintom forintov forintoma forintih forinte
zlot zlota zloti zlotu zlotom zlotov zlotoma zlotih zlote
rupij rupija rupiji rupiju rupijem rupijev rupijema rupijih rupije
jen jena jeni jenu jenom jenov jenoma jenih jene
kuna kuni kune kuno kun kunama kunah kunam kunami
marka marki marke markama markah markami
""".split()
)
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
if text_lower in _num_words:
return True
if text_lower in _ordinal_words:
return True
return False
def is_currency(text):
text_lower = text.lower()
if text in _currency_words:
return True
for char in text:
if unicodedata.category(char) != "Sc":
return False
return True
LEX_ATTRS = {LIKE_NUM: like_num, IS_CURRENCY: is_currency}

View File

@ -0,0 +1,84 @@
from ..char_classes import (
LIST_ELLIPSES,
LIST_ICONS,
HYPHENS,
LIST_PUNCT,
LIST_QUOTES,
CURRENCY,
UNITS,
PUNCT,
LIST_CURRENCY,
CONCAT_QUOTES,
)
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
from ..char_classes import merge_chars
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
INCLUDE_SPECIAL = ["\\+", "\\/", "\\", "\\¯", "\\=", "\\×"] + HYPHENS.split("|")
_prefixes = INCLUDE_SPECIAL + BASE_TOKENIZER_PREFIXES
_suffixes = (
INCLUDE_SPECIAL
+ LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_ICONS
+ [
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[{al}{e}{p}(?:{q})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
),
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
# split initials like J.K. Rowling
r"(?<=[A-Z]\.)(?:[A-Z].)",
]
)
# a list of all suffixes following a hyphen that are shouldn't split (eg. BTC-jev)
# source: Obeliks tokenizer - https://github.com/clarinsi/obeliks/blob/master/obeliks/res/TokRulesPart1.txt
CONCAT_QUOTES = CONCAT_QUOTES.replace("'", "")
HYPHENS_PERMITTED = (
"((a)|(evemu)|(evskega)|(i)|(jevega)|(jevska)|(jevskimi)|(jinemu)|(oma)|(ovim)|"
"(ovski)|(e)|(evi)|(evskem)|(ih)|(jevem)|(jevske)|(jevsko)|(jini)|(ov)|(ovima)|"
"(ovskih)|(em)|(evih)|(evskemu)|(ja)|(jevemu)|(jevskega)|(ji)|(jinih)|(ova)|"
"(ovimi)|(ovskim)|(ema)|(evim)|(evski)|(je)|(jevi)|(jevskem)|(jih)|(jinim)|"
"(ove)|(ovo)|(ovskima)|(ev)|(evima)|(evskih)|(jem)|(jevih)|(jevskemu)|(jin)|"
"(jinima)|(ovega)|(ovska)|(ovskimi)|(eva)|(evimi)|(evskim)|(jema)|(jevim)|"
"(jevski)|(jina)|(jinimi)|(ovem)|(ovske)|(ovsko)|(eve)|(evo)|(evskima)|(jev)|"
"(jevima)|(jevskih)|(jine)|(jino)|(ovemu)|(ovskega)|(u)|(evega)|(evska)|"
"(evskimi)|(jeva)|(jevimi)|(jevskim)|(jinega)|(ju)|(ovi)|(ovskem)|(evem)|"
"(evske)|(evsko)|(jeve)|(jevo)|(jevskima)|(jinem)|(om)|(ovih)|(ovskemu)|"
"(ovec)|(ovca)|(ovcu)|(ovcem)|(ovcev)|(ovcema)|(ovcih)|(ovci)|(ovce)|(ovcimi)|"
"(evec)|(evca)|(evcu)|(evcem)|(evcev)|(evcema)|(evcih)|(evci)|(evce)|(evcimi)|"
"(jevec)|(jevca)|(jevcu)|(jevcem)|(jevcev)|(jevcema)|(jevcih)|(jevci)|(jevce)|"
"(jevcimi)|(ovka)|(ovke)|(ovki)|(ovko)|(ovk)|(ovkama)|(ovkah)|(ovkam)|(ovkami)|"
"(evka)|(evke)|(evki)|(evko)|(evk)|(evkama)|(evkah)|(evkam)|(evkami)|(jevka)|"
"(jevke)|(jevki)|(jevko)|(jevk)|(jevkama)|(jevkah)|(jevkam)|(jevkami)|(timi)|"
"(im)|(ima)|(a)|(imi)|(e)|(o)|(ega)|(ti)|(em)|(tih)|(emu)|(tim)|(i)|(tima)|"
"(ih)|(ta)|(te)|(to)|(tega)|(tem)|(temu))"
)
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])(?:{h})(?!{hp}$)(?=[{a}])".format(
a=ALPHA, h=HYPHENS, hp=HYPHENS_PERMITTED
),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes

View File

@ -1,326 +1,84 @@
# Source: https://github.com/stopwords-iso/stopwords-sl
# Removed various words that are not normally considered stop words, such as months.
STOP_WORDS = set(
"""
a
ali
b
bi
bil
bila
bile
bili
bilo
biti
blizu
bo
bodo
bolj
bom
bomo
boste
bova
boš
brez
c
cel
cela
celi
celo
d
da
daleč
dan
danes
do
dober
dobra
dobri
dobro
dokler
dol
dovolj
e
eden
en
ena
ene
eni
enkrat
eno
etc.
a ali
b bi bil bila bile bili bilo biti blizu bo bodo bojo bolj bom bomo
boste bova boš brez
c cel cela celi celo
č če često četrta četrtek četrti četrto čez čigav
d da daleč dan danes datum deset deseta deseti deseto devet
deveta deveti deveto do dober dobra dobri dobro dokler dol dolg
dolga dolgi dovolj drug druga drugi drugo dva dve
e eden en ena ene eni enkrat eno etc.
f
g
g.
ga
ga.
gor
gospa
gospod
h
halo
i
idr.
ii
iii
in
iv
ix
iz
j
jaz
je
ji
jih
jim
jo
k
kadarkoli
kaj
kajti
kako
kakor
kamor
kamorkoli
kar
karkoli
katerikoli
kdaj
kdo
kdorkoli
ker
ki
kje
kjer
kjerkoli
ko
koderkoli
koga
komu
kot
l
le
lep
lepa
lepe
lepi
lepo
m
manj
me
med
medtem
mene
mi
midva
midve
mnogo
moj
moja
moje
mora
morajo
moram
moramo
morate
moraš
morem
mu
n
na
nad
naj
najina
najino
najmanj
naju
največ
nam
nas
nato
nazaj
naš
naša
naše
ne
nedavno
nek
neka
nekaj
nekatere
nekateri
nekatero
nekdo
neke
nekega
neki
nekje
neko
nekoga
nekoč
ni
nikamor
nikdar
nikjer
nikoli
nič
nje
njega
njegov
njegova
njegovo
njej
njemu
njen
njena
njeno
nji
njih
njihov
njihova
njihovo
njiju
njim
njo
njun
njuna
njuno
no
nocoj
npr.
o
ob
oba
obe
oboje
od
okoli
on
onadva
one
oni
onidve
oz.
p
pa
po
pod
pogosto
poleg
ponavadi
ponovno
potem
povsod
prbl.
precej
pred
prej
preko
pri
pribl.
približno
proti
r
redko
res
s
saj
sam
sama
same
sami
samo
se
sebe
sebi
sedaj
sem
seveda
si
sicer
skoraj
skozi
smo
so
spet
sta
ste
sva
t
ta
tak
taka
take
taki
tako
takoj
tam
te
tebe
tebi
tega
ti
tista
tiste
tisti
tisto
tj.
tja
to
toda
tu
tudi
tukaj
tvoj
tvoja
tvoje
g g. ga ga. gor gospa gospod
h halo
i idr. ii iii in iv ix iz
j jaz je ji jih jim jo jutri
k kadarkoli kaj kajti kako kakor kamor kamorkoli kar karkoli
katerikoli kdaj kdo kdorkoli ker ki kje kjer kjerkoli
ko koder koderkoli koga komu kot kratek kratka kratke kratki
l lahka lahke lahki lahko le lep lepa lepe lepi lepo leto
m majhen majhna majhni malce malo manj me med medtem mene
mesec mi midva midve mnogo moj moja moje mora morajo moram
moramo morate moraš morem mu
n na nad naj najina najino najmanj naju največ nam narobe
nas nato nazaj naš naša naše ne nedavno nedelja nek neka
nekaj nekatere nekateri nekatero nekdo neke nekega neki
nekje neko nekoga nekoč ni nikamor nikdar nikjer nikoli
nič nje njega njegov njegova njegovo njej njemu njen
njena njeno nji njih njihov njihova njihovo njiju njim
njo njun njuna njuno no nocoj npr.
o ob oba obe oboje od odprt odprta odprti okoli on
onadva one oni onidve osem osma osmi osmo oz.
p pa pet peta petek peti peto po pod pogosto poleg poln
polna polni polno ponavadi ponedeljek ponovno potem
povsod pozdravljen pozdravljeni prav prava prave pravi
pravo prazen prazna prazno prbl. precej pred prej preko
pri pribl. približno primer pripravljen pripravljena
pripravljeni proti prva prvi prvo
r ravno redko res reč
s saj sam sama same sami samo se sebe sebi sedaj sedem
sedma sedmi sedmo sem seveda si sicer skoraj skozi slab sm
so sobota spet sreda srednja srednji sta ste stran stvar sva
š šest šesta šesti šesto štiri
t ta tak taka take taki tako takoj tam te tebe tebi tega
težak težka težki težko ti tista tiste tisti tisto tj.
tja to toda torek tretja tretje tretji tri tu tudi tukaj
tvoj tvoja tvoje
u
v
vaju
vam
vas
vaš
vaša
vaše
ve
vedno
vendar
ves
več
vi
vidva
vii
viii
vsa
vsaj
vsak
vsaka
vsakdo
vsake
vsaki
vsakomur
vse
vsega
vsi
vso
včasih
x
z
za
zadaj
zadnji
zakaj
zdaj
zelo
zunaj
č
če
često
čez
čigav
š
ž
že
v vaju vam vas vaš vaša vaše ve vedno velik velika veliki
veliko vendar ves več vi vidva vii viii visok visoka visoke
visoki vsa vsaj vsak vsaka vsakdo vsake vsaki vsakomur vse
vsega vsi vso včasih včeraj
x
z za zadaj zadnji zakaj zaprta zaprti zaprto zdaj zelo zunaj
ž že
""".split()
)

View File

@ -0,0 +1,272 @@
from typing import Dict, List
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc: Dict[str, List[Dict]] = {}
_other_exc = {
"t.i.": [{ORTH: "t.", NORM: "tako"}, {ORTH: "i.", NORM: "imenovano"}],
"t.j.": [{ORTH: "t.", NORM: "to"}, {ORTH: "j.", NORM: "je"}],
"T.j.": [{ORTH: "T.", NORM: "to"}, {ORTH: "j.", NORM: "je"}],
"d.o.o.": [
{ORTH: "d.", NORM: "družba"},
{ORTH: "o.", NORM: "omejeno"},
{ORTH: "o.", NORM: "odgovornostjo"},
],
"D.O.O.": [
{ORTH: "D.", NORM: "družba"},
{ORTH: "O.", NORM: "omejeno"},
{ORTH: "O.", NORM: "odgovornostjo"},
],
"d.n.o.": [
{ORTH: "d.", NORM: "družba"},
{ORTH: "n.", NORM: "neomejeno"},
{ORTH: "o.", NORM: "odgovornostjo"},
],
"D.N.O.": [
{ORTH: "D.", NORM: "družba"},
{ORTH: "N.", NORM: "neomejeno"},
{ORTH: "O.", NORM: "odgovornostjo"},
],
"d.d.": [{ORTH: "d.", NORM: "delniška"}, {ORTH: "d.", NORM: "družba"}],
"D.D.": [{ORTH: "D.", NORM: "delniška"}, {ORTH: "D.", NORM: "družba"}],
"s.p.": [{ORTH: "s.", NORM: "samostojni"}, {ORTH: "p.", NORM: "podjetnik"}],
"S.P.": [{ORTH: "S.", NORM: "samostojni"}, {ORTH: "P.", NORM: "podjetnik"}],
"l.r.": [{ORTH: "l.", NORM: "lastno"}, {ORTH: "r.", NORM: "ročno"}],
"le-te": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "te"}],
"Le-te": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "te"}],
"le-ti": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "ti"}],
"Le-ti": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "ti"}],
"le-to": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "to"}],
"Le-to": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "to"}],
"le-ta": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "ta"}],
"Le-ta": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "ta"}],
"le-tega": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "tega"}],
"Le-tega": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "tega"}],
}
_exc.update(_other_exc)
for exc_data in [
{ORTH: "adm.", NORM: "administracija"},
{ORTH: "aer.", NORM: "aeronavtika"},
{ORTH: "agr.", NORM: "agronomija"},
{ORTH: "amer.", NORM: "ameriško"},
{ORTH: "anat.", NORM: "anatomija"},
{ORTH: "angl.", NORM: "angleški"},
{ORTH: "ant.", NORM: "antonim"},
{ORTH: "antr.", NORM: "antropologija"},
{ORTH: "apr.", NORM: "april"},
{ORTH: "arab.", NORM: "arabsko"},
{ORTH: "arheol.", NORM: "arheologija"},
{ORTH: "arhit.", NORM: "arhitektura"},
{ORTH: "avg.", NORM: "avgust"},
{ORTH: "avstr.", NORM: "avstrijsko"},
{ORTH: "avt.", NORM: "avtomobilizem"},
{ORTH: "bibl.", NORM: "biblijsko"},
{ORTH: "biokem.", NORM: "biokemija"},
{ORTH: "biol.", NORM: "biologija"},
{ORTH: "bolg.", NORM: "bolgarski"},
{ORTH: "bot.", NORM: "botanika"},
{ORTH: "cit.", NORM: "citat"},
{ORTH: "daj.", NORM: "dajalnik"},
{ORTH: "del.", NORM: "deležnik"},
{ORTH: "ed.", NORM: "ednina"},
{ORTH: "etn.", NORM: "etnografija"},
{ORTH: "farm.", NORM: "farmacija"},
{ORTH: "filat.", NORM: "filatelija"},
{ORTH: "filoz.", NORM: "filozofija"},
{ORTH: "fin.", NORM: "finančništvo"},
{ORTH: "fiz.", NORM: "fizika"},
{ORTH: "fot.", NORM: "fotografija"},
{ORTH: "fr.", NORM: "francoski"},
{ORTH: "friz.", NORM: "frizerstvo"},
{ORTH: "gastr.", NORM: "gastronomija"},
{ORTH: "geogr.", NORM: "geografija"},
{ORTH: "geol.", NORM: "geologija"},
{ORTH: "geom.", NORM: "geometrija"},
{ORTH: "germ.", NORM: "germanski"},
{ORTH: "gl.", NORM: "glej"},
{ORTH: "glag.", NORM: "glagolski"},
{ORTH: "glasb.", NORM: "glasba"},
{ORTH: "gled.", NORM: "gledališče"},
{ORTH: "gost.", NORM: "gostinstvo"},
{ORTH: "gozd.", NORM: "gozdarstvo"},
{ORTH: "gr.", NORM: "grški"},
{ORTH: "grad.", NORM: "gradbeništvo"},
{ORTH: "hebr.", NORM: "hebrejsko"},
{ORTH: "hrv.", NORM: "hrvaško"},
{ORTH: "ide.", NORM: "indoevropsko"},
{ORTH: "igr.", NORM: "igre"},
{ORTH: "im.", NORM: "imenovalnik"},
{ORTH: "iron.", NORM: "ironično"},
{ORTH: "it.", NORM: "italijanski"},
{ORTH: "itd.", NORM: "in tako dalje"},
{ORTH: "itn.", NORM: "in tako naprej"},
{ORTH: "ipd.", NORM: "in podobno"},
{ORTH: "jap.", NORM: "japonsko"},
{ORTH: "jul.", NORM: "julij"},
{ORTH: "jun.", NORM: "junij"},
{ORTH: "kit.", NORM: "kitajsko"},
{ORTH: "knj.", NORM: "knjižno"},
{ORTH: "knjiž.", NORM: "knjižno"},
{ORTH: "kor.", NORM: "koreografija"},
{ORTH: "lat.", NORM: "latinski"},
{ORTH: "les.", NORM: "lesna stroka"},
{ORTH: "lingv.", NORM: "lingvistika"},
{ORTH: "lit.", NORM: "literarni"},
{ORTH: "ljubk.", NORM: "ljubkovalno"},
{ORTH: "lov.", NORM: "lovstvo"},
{ORTH: "m.", NORM: "moški"},
{ORTH: "mak.", NORM: "makedonski"},
{ORTH: "mar.", NORM: "marec"},
{ORTH: "mat.", NORM: "matematika"},
{ORTH: "med.", NORM: "medicina"},
{ORTH: "meh.", NORM: "mehiško"},
{ORTH: "mest.", NORM: "mestnik"},
{ORTH: "mdr.", NORM: "med drugim"},
{ORTH: "min.", NORM: "mineralogija"},
{ORTH: "mitol.", NORM: "mitologija"},
{ORTH: "mn.", NORM: "množina"},
{ORTH: "mont.", NORM: "montanistika"},
{ORTH: "muz.", NORM: "muzikologija"},
{ORTH: "nam.", NORM: "namenilnik"},
{ORTH: "nar.", NORM: "narečno"},
{ORTH: "nav.", NORM: "navadno"},
{ORTH: "nedol.", NORM: "nedoločnik"},
{ORTH: "nedov.", NORM: "nedovršni"},
{ORTH: "neprav.", NORM: "nepravilno"},
{ORTH: "nepreh.", NORM: "neprehodno"},
{ORTH: "neskl.", NORM: "nesklonljiv(o)"},
{ORTH: "nestrok.", NORM: "nestrokovno"},
{ORTH: "num.", NORM: "numizmatika"},
{ORTH: "npr.", NORM: "na primer"},
{ORTH: "obrt.", NORM: "obrtništvo"},
{ORTH: "okt.", NORM: "oktober"},
{ORTH: "or.", NORM: "orodnik"},
{ORTH: "os.", NORM: "oseba"},
{ORTH: "otr.", NORM: "otroško"},
{ORTH: "oz.", NORM: "oziroma"},
{ORTH: "pal.", NORM: "paleontologija"},
{ORTH: "papir.", NORM: "papirništvo"},
{ORTH: "ped.", NORM: "pedagogika"},
{ORTH: "pisar.", NORM: "pisarniško"},
{ORTH: "pog.", NORM: "pogovorno"},
{ORTH: "polit.", NORM: "politika"},
{ORTH: "polj.", NORM: "poljsko"},
{ORTH: "poljud.", NORM: "poljudno"},
{ORTH: "preg.", NORM: "pregovor"},
{ORTH: "preh.", NORM: "prehodno"},
{ORTH: "pren.", NORM: "preneseno"},
{ORTH: "prid.", NORM: "pridevnik"},
{ORTH: "prim.", NORM: "primerjaj"},
{ORTH: "prisl.", NORM: "prislov"},
{ORTH: "psih.", NORM: "psihologija"},
{ORTH: "psiht.", NORM: "psihiatrija"},
{ORTH: "rad.", NORM: "radiotehnika"},
{ORTH: "rač.", NORM: "računalništvo"},
{ORTH: "rib.", NORM: "ribištvo"},
{ORTH: "rod.", NORM: "rodilnik"},
{ORTH: "rus.", NORM: "rusko"},
{ORTH: "s.", NORM: "srednji"},
{ORTH: "sam.", NORM: "samostalniški"},
{ORTH: "sed.", NORM: "sedanjik"},
{ORTH: "sep.", NORM: "september"},
{ORTH: "slabš.", NORM: "slabšalno"},
{ORTH: "slovan.", NORM: "slovansko"},
{ORTH: "slovaš.", NORM: "slovaško"},
{ORTH: "srb.", NORM: "srbsko"},
{ORTH: "star.", NORM: "starinsko"},
{ORTH: "stil.", NORM: "stilno"},
{ORTH: "sv.", NORM: "svet(i)"},
{ORTH: "teh.", NORM: "tehnika"},
{ORTH: "tisk.", NORM: "tiskarstvo"},
{ORTH: "tj.", NORM: "to je"},
{ORTH: "tož.", NORM: "tožilnik"},
{ORTH: "trg.", NORM: "trgovina"},
{ORTH: "ukr.", NORM: "ukrajinski"},
{ORTH: "um.", NORM: "umetnost"},
{ORTH: "vel.", NORM: "velelnik"},
{ORTH: "vet.", NORM: "veterina"},
{ORTH: "vez.", NORM: "veznik"},
{ORTH: "vn.", NORM: "visokonemško"},
{ORTH: "voj.", NORM: "vojska"},
{ORTH: "vrtn.", NORM: "vrtnarstvo"},
{ORTH: "vulg.", NORM: "vulgarno"},
{ORTH: "vznes.", NORM: "vzneseno"},
{ORTH: "zal.", NORM: "založništvo"},
{ORTH: "zastar.", NORM: "zastarelo"},
{ORTH: "zgod.", NORM: "zgodovina"},
{ORTH: "zool.", NORM: "zoologija"},
{ORTH: "čeb.", NORM: "čebelarstvo"},
{ORTH: "češ.", NORM: "češki"},
{ORTH: "člov.", NORM: "človeškost"},
{ORTH: "šah.", NORM: "šahovski"},
{ORTH: "šalj.", NORM: "šaljivo"},
{ORTH: "šp.", NORM: "španski"},
{ORTH: "špan.", NORM: "špansko"},
{ORTH: "šport.", NORM: "športni"},
{ORTH: "štev.", NORM: "števnik"},
{ORTH: "šved.", NORM: "švedsko"},
{ORTH: "švic.", NORM: "švicarsko"},
{ORTH: "ž.", NORM: "ženski"},
{ORTH: "žarg.", NORM: "žargonsko"},
{ORTH: "žel.", NORM: "železnica"},
{ORTH: "živ.", NORM: "živost"},
]:
_exc[exc_data[ORTH]] = [exc_data]
abbrv = """
Co. Ch. DIPL. DR. Dr. Ev. Inc. Jr. Kr. Mag. M. MR. Mr. Mt. Murr. Npr. OZ.
Opr. Osn. Prim. Roj. ST. Sim. Sp. Sred. St. Sv. Škofl. Tel. UR. Zb.
a. aa. ab. abc. abit. abl. abs. abt. acc. accel. add. adj. adv. aet. afr. akad. al. alban. all. alleg.
alp. alt. alter. alžir. am. an. andr. ang. anh. anon. ans. antrop. apoc. app. approx. apt. ar. arc. arch.
arh. arr. as. asist. assist. assoc. asst. astr. attn. aug. avstral. az. b. bab. bal. bbl. bd. belg. bioinf.
biomed. bk. bl. bn. borg. bp. br. braz. brit. bros. broš. bt. bu. c. ca. cal. can. cand. cantab. cap. capt.
cat. cath. cc. cca. cd. cdr. cdre. cent. cerkv. cert. cf. cfr. ch. chap. chem. chr. chs. cic. circ. civ. cl.
cm. cmd. cnr. co. cod. col. coll. colo. com. comp. con. conc. cond. conn. cons. cont. coop. corr. cost. cp.
cpl. cr. crd. cres. cresc. ct. cu. d. dan. dat. davč. ddr. dec. ded. def. dem. dent. dept. dia. dip. dipl.
dir. disp. diss. div. do. doc. dok. dol. doo. dop. dott. dr. dram. druž. družb. drž. dt. duh. dur. dvr. dwt. e.
ea. ecc. eccl. eccles. econ. edn. egipt. egr. ekon. eksp. el. em. enc. eng. eo. ep. err. esp. esq. est.
et. etc. etnogr. etnol. ev. evfem. evr. ex. exc. excl. exp. expl. ext. exx. f. fa. facs. fak. faks. fas.
fasc. fco. fcp. feb. febr. fec. fed. fem. ff. fff. fid. fig. fil. film. fiziol. fiziot. flam. fm. fo. fol. folk.
frag. fran. franc. fsc. g. ga. gal. gdč. ge. gen. geod. geog. geotehnol. gg. gimn. glas. glav. gnr. go. gor.
gosp. gp. graf. gram. gren. grš. gs. h. hab. hf. hist. ho. hort. i. ia. ib. ibid. id. idr. idridr. ill. imen.
imp. impf. impr. in. inc. incl. ind. indus. inf. inform. ing. init. ins. int. inv. inšp. inštr. inž. is. islam.
ist. ital. iur. iz. izbr. izd. izg. izgr. izr. izv. j. jak. jam. jan. jav. je. jez. jr. jsl. jud. jug.
jugoslovan. jur. juž. jv. jz. k. kal. kan. kand. kat. kdo. kem. kip. kmet. kol. kom. komp. konf. kont. kost. kov.
kp. kpfw. kr. kraj. krat. kub. kult. kv. kval. l. la. lab. lb. ld. let. lib. lik. litt. lj. ljud. ll. loc. log.
loč. lt. ma. madž. mag. manag. manjš. masc. mass. mater. max. maxmax. mb. md. mech. medic. medij. medn.
mehč. mem. menedž. mes. mess. metal. meteor. meteorol. mex. mi. mikr. mil. minn. mio. misc. miss. mit. mk.
mkt. ml. mlad. mlle. mlr. mm. mme. množ. mo. moj. moš. možn. mr. mrd. mrs. ms. msc. msgr. mt. murr. mus. mut.
n. na. nad. nadalj. nadom. nagl. nakl. namer. nan. naniz. nasl. nat. navt. nač. ned. nem. nik. nizoz. nm. nn.
no. nom. norv. notr. nov. novogr. ns. o. ob. obd. obj. oblač. obl. oblik. obr. obraz. obs. obst. obt. obč. oc.
oct. od. odd. odg. odn. odst. odv. oec. off. ok. okla. okr. ont. oo. op. opis. opp. opr. orch. ord. ore. oreg.
org. orient. orig. ork. ort. oseb. osn. ot. ozir. ošk. p. pag. par. para. parc. parl. part. past. pat. pdk.
pen. perf. pert. perz. pesn. pet. pev. pf. pfc. ph. pharm. phil. pis. pl. po. pod. podr. podaljš. pogl. pogoj. pojm.
pok. pokr. pol. poljed. poljub. polu. pom. pomen. pon. ponov. pop. por. port. pos. posl. posn. pov. pp. ppl. pr.
praet. prav. pravopis. pravosl. preb. pred. predl. predm. predp. preds. pref. pregib. prel. prem. premen. prep.
pres. pret. prev. pribl. prih. pril. primerj. primor. prip. pripor. prir. prist. priv. proc. prof. prog. proiz.
prom. pron. prop. prot. protest. prov. ps. pss. pt. publ. pz. q. qld. qu. quad. que. r. racc. rastl. razgl.
razl. razv. rd. red. ref. reg. rel. relig. rep. repr. rer. resp. rest. ret. rev. revol. rež. rim. rist. rkp. rm.
roj. rom. romun. rp. rr. rt. rud. ruš. ry. sal. samogl. san. sc. scen. sci. scr. sdv. seg. sek. sen. sept. ser.
sev. sg. sgt. sh. sig. sigg. sign. sim. sin. sing. sinh. skand. skl. sklad. sklanj. sklep. skr. sl. slik. slov.
slovak. slovn. sn. so. sob. soc. sociol. sod. sopomen. sopr. sor. sov. sovj. sp. spec. spl. spr. spreg. sq. sr.
sre. sred. sredoz. srh. ss. ssp. st. sta. stan. stanstar. stcsl. ste. stim. stol. stom. str. stroj. strok. stsl.
stud. sup. supl. suppl. svet. sz. t. tab. tech. ted. tehn. tehnol. tek. teks. tekst. tel. temp. ten. teol. ter.
term. test. th. theol. tim. tip. tisočl. tit. tl. tol. tolmač. tom. tor. tov. tr. trad. traj. trans. tren.
trib. tril. trop. trp. trž. ts. tt. tu. tur. turiz. tvor. tvorb. . u. ul. umet. un. univ. up. upr. ur. urad.
us. ust. utr. v. va. val. var. varn. ven. ver. verb. vest. vezal. vic. vis. viv. viz. viš. vod. vok. vol. vpr.
vrst. vrstil. vs. vv. vzd. vzg. vzh. vzor. w. wed. wg. wk. x. y. z. zah. zaim. zak. zap. zasl. zavar. zač. zb.
združ. zg. zn. znan. znanstv. zoot. zun. zv. zvd. á. é. ć. č. čas. čet. čl. člen. čustv. đ. ľ. ł. ş. ŠT. š. šir.
škofl. škot. šol. št. števil. štud. ů. ű. žen. žival.
""".split()
for orth in abbrv:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -17,10 +17,6 @@ URL_PATTERN = (
r"(?:\S+(?::\S*)?@)?"
r"(?:"
# IP address exclusion
# private & local networks
r"(?!(?:10|127)(?:\.\d{1,3}){3})"
r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
# IP address dotted notation octets
# excludes loopback network 0.0.0.0
# excludes reserved space >= 224.0.0.0

View File

@ -29,7 +29,7 @@ class Ukrainian(Language):
assigns=["token.lemma"],
default_config={
"model": None,
"mode": "pymorphy2",
"mode": "pymorphy3",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},

View File

@ -14,7 +14,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
model: Optional[Model],
name: str = "lemmatizer",
*,
mode: str = "pymorphy2",
mode: str = "pymorphy3",
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
@ -29,6 +29,17 @@ class UkrainianLemmatizer(RussianLemmatizer):
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="uk")
elif mode == "pymorphy3":
try:
from pymorphy3 import MorphAnalyzer
except ImportError:
raise ImportError(
"The Ukrainian lemmatizer mode 'pymorphy3' requires the "
"pymorphy3 library and dictionaries. Install them with: "
"pip install pymorphy3 pymorphy3-dicts-uk"
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="uk")
super().__init__(
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)

View File

@ -465,6 +465,8 @@ class Language:
"""
if not isinstance(name, str):
raise ValueError(Errors.E963.format(decorator="factory"))
if "." in name:
raise ValueError(Errors.E853.format(name=name))
if not isinstance(default_config, dict):
err = Errors.E962.format(
style="default config", name=name, cfg_type=type(default_config)
@ -543,8 +545,11 @@ class Language:
DOCS: https://spacy.io/api/language#component
"""
if name is not None and not isinstance(name, str):
raise ValueError(Errors.E963.format(decorator="component"))
if name is not None:
if not isinstance(name, str):
raise ValueError(Errors.E963.format(decorator="component"))
if "." in name:
raise ValueError(Errors.E853.format(name=name))
component_name = name if name is not None else util.get_object_name(func)
def add_component(component_func: "Pipe") -> Callable:

View File

@ -1,9 +1,9 @@
from .attributeruler import AttributeRuler
from .attribute_ruler import AttributeRuler
from .dep_parser import DependencyParser
from .edit_tree_lemmatizer import EditTreeLemmatizer
from .entity_linker import EntityLinker
from .ner import EntityRecognizer
from .entityruler import EntityRuler
from .entity_ruler import EntityRuler
from .lemmatizer import Lemmatizer
from .morphologizer import Morphologizer
from .pipe import Pipe

View File

@ -207,7 +207,7 @@ class TokenPatternOperatorSimple(str, Enum):
class TokenPatternOperatorMinMax(ConstrainedStr):
regex = re.compile("^({\d+}|{\d+,\d*}|{\d*,\d+})$")
regex = re.compile(r"^({\d+}|{\d+,\d*}|{\d*,\d+})$")
TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax]
@ -514,6 +514,14 @@ class DocJSONSchema(BaseModel):
tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
..., title="Token information - ID, start, annotations"
)
_: Optional[Dict[StrictStr, Any]] = Field(
None, title="Any custom data stored in the document's _ attribute"
underscore_doc: Optional[Dict[StrictStr, Any]] = Field(
None,
title="Any custom data stored in the document's _ attribute",
alias="_",
)
underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
None, title="Any custom data stored in the token's _ attribute"
)
underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
None, title="Any custom data stored in the span's _ attribute"
)

View File

@ -239,7 +239,7 @@ def hsb_tokenizer():
@pytest.fixture(scope="session")
def ko_tokenizer():
pytest.importorskip("natto")
pytest.importorskip("mecab_ko")
return get_lang_class("ko")().tokenizer
@ -256,11 +256,30 @@ def ko_tokenizer_tokenizer():
return nlp.tokenizer
@pytest.fixture(scope="session")
def ko_tokenizer_natto():
pytest.importorskip("natto")
config = {
"nlp": {
"tokenizer": {
"@tokenizers": "spacy.KoreanNattoTokenizer.v1",
}
}
}
nlp = get_lang_class("ko").from_config(config)
return nlp.tokenizer
@pytest.fixture(scope="session")
def lb_tokenizer():
return get_lang_class("lb")().tokenizer
@pytest.fixture(scope="session")
def lg_tokenizer():
return get_lang_class("lg")().tokenizer
@pytest.fixture(scope="session")
def lt_tokenizer():
return get_lang_class("lt")().tokenizer
@ -323,13 +342,13 @@ def ro_tokenizer():
@pytest.fixture(scope="session")
def ru_tokenizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy3")
return get_lang_class("ru")().tokenizer
@pytest.fixture
def ru_lemmatizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy3")
return get_lang_class("ru")().add_pipe("lemmatizer")
@ -401,14 +420,14 @@ def ky_tokenizer():
@pytest.fixture(scope="session")
def uk_tokenizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy3")
return get_lang_class("uk")().tokenizer
@pytest.fixture
def uk_lemmatizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy2_dicts_uk")
pytest.importorskip("pymorphy3")
pytest.importorskip("pymorphy3_dicts_uk")
return get_lang_class("uk")().add_pipe("lemmatizer")

View File

@ -45,6 +45,33 @@ def test_ents_reset(en_vocab):
assert [t.ent_iob_ for t in doc] == orig_iobs
def test_ents_clear(en_vocab):
"""Ensure that removing entities clears token attributes"""
text = ["Louisiana", "Office", "of", "Conservation"]
doc = Doc(en_vocab, words=text)
entity = Span(doc, 0, 4, label=391, span_id="TEST")
doc.ents = [entity]
doc.ents = []
for token in doc:
assert token.ent_iob == 2
assert token.ent_type == 0
assert token.ent_id == 0
assert token.ent_kb_id == 0
doc.ents = [entity]
doc.set_ents([], default="missing")
for token in doc:
assert token.ent_iob == 0
assert token.ent_type == 0
assert token.ent_id == 0
assert token.ent_kb_id == 0
doc.set_ents([], default="blocked")
for token in doc:
assert token.ent_iob == 3
assert token.ent_type == 0
assert token.ent_id == 0
assert token.ent_kb_id == 0
def test_add_overlapping_entities(en_vocab):
text = ["Louisiana", "Office", "of", "Conservation"]
doc = Doc(en_vocab, words=text)

View File

@ -3,6 +3,7 @@ import weakref
import numpy
from numpy.testing import assert_array_equal
import pytest
import warnings
from thinc.api import NumpyOps, get_current_ops
from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS
@ -529,9 +530,9 @@ def test_doc_from_array_sent_starts(en_vocab):
# no warning using default attrs
attrs = doc._get_array_attrs()
arr = doc.to_array(attrs)
with pytest.warns(None) as record:
with warnings.catch_warnings():
warnings.simplefilter("error")
new_doc.from_array(attrs, arr)
assert len(record) == 0
# only SENT_START uses SENT_START
attrs = [SENT_START]
arr = doc.to_array(attrs)

View File

@ -1,12 +1,15 @@
import pytest
import spacy
from spacy import schemas
from spacy.tokens import Doc, Span
from spacy.tokens import Doc, Span, Token
import srsly
from .test_underscore import clean_underscore # noqa: F401
@pytest.fixture()
def doc(en_vocab):
words = ["c", "d", "e"]
spaces = [True, True, True]
pos = ["VERB", "NOUN", "NOUN"]
tags = ["VBP", "NN", "NN"]
heads = [0, 0, 1]
@ -17,6 +20,7 @@ def doc(en_vocab):
return Doc(
en_vocab,
words=words,
spaces=spaces,
pos=pos,
tags=tags,
heads=heads,
@ -45,6 +49,47 @@ def doc_without_deps(en_vocab):
)
@pytest.fixture()
def doc_json():
return {
"text": "c d e ",
"ents": [{"start": 2, "end": 3, "label": "ORG"}],
"sents": [{"start": 0, "end": 5}],
"tokens": [
{
"id": 0,
"start": 0,
"end": 1,
"tag": "VBP",
"pos": "VERB",
"morph": "Feat1=A",
"dep": "ROOT",
"head": 0,
},
{
"id": 1,
"start": 2,
"end": 3,
"tag": "NN",
"pos": "NOUN",
"morph": "Feat1=B",
"dep": "dobj",
"head": 0,
},
{
"id": 2,
"start": 4,
"end": 5,
"tag": "NN",
"pos": "NOUN",
"morph": "Feat1=A|Feat2=D",
"dep": "dobj",
"head": 1,
},
],
}
def test_doc_to_json(doc):
json_doc = doc.to_json()
assert json_doc["text"] == "c d e "
@ -56,7 +101,8 @@ def test_doc_to_json(doc):
assert json_doc["ents"][0]["start"] == 2 # character offset!
assert json_doc["ents"][0]["end"] == 3 # character offset!
assert json_doc["ents"][0]["label"] == "ORG"
assert not schemas.validate(schemas.DocJSONSchema, json_doc)
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
def test_doc_to_json_underscore(doc):
@ -64,11 +110,96 @@ def test_doc_to_json_underscore(doc):
Doc.set_extension("json_test2", default=False)
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
assert "_" in json_doc
assert json_doc["_"]["json_test1"] == "hello world"
assert json_doc["_"]["json_test2"] == [1, 2, 3]
assert not schemas.validate(schemas.DocJSONSchema, json_doc)
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
def test_doc_to_json_with_token_span_attributes(doc):
Doc.set_extension("json_test1", default=False)
Doc.set_extension("json_test2", default=False)
Token.set_extension("token_test", default=False)
Span.set_extension("span_test", default=False)
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
doc[0:1]._.span_test = "span_attribute"
doc[0]._.token_test = 117
doc.spans["span_group"] = [doc[0:1]]
json_doc = doc.to_json(
underscore=["json_test1", "json_test2", "token_test", "span_test"]
)
assert "_" in json_doc
assert json_doc["_"]["json_test1"] == "hello world"
assert json_doc["_"]["json_test2"] == [1, 2, 3]
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["token_test"]["value"] == 117
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
def test_doc_to_json_with_custom_user_data(doc):
Doc.set_extension("json_test", default=False)
Token.set_extension("token_test", default=False)
Span.set_extension("span_test", default=False)
doc._.json_test = "hello world"
doc[0:1]._.span_test = "span_attribute"
doc[0]._.token_test = 117
json_doc = doc.to_json(underscore=["json_test", "token_test", "span_test"])
doc.user_data["user_data_test"] = 10
doc.user_data[("user_data_test2", True)] = 10
assert "_" in json_doc
assert json_doc["_"]["json_test"] == "hello world"
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["token_test"]["value"] == 117
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
def test_doc_to_json_with_token_span_same_identifier(doc):
Doc.set_extension("my_ext", default=False)
Token.set_extension("my_ext", default=False)
Span.set_extension("my_ext", default=False)
doc._.my_ext = "hello world"
doc[0:1]._.my_ext = "span_attribute"
doc[0]._.my_ext = 117
json_doc = doc.to_json(underscore=["my_ext"])
assert "_" in json_doc
assert json_doc["_"]["my_ext"] == "hello world"
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["my_ext"]["value"] == 117
assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
def test_doc_to_json_with_token_attributes_missing(doc):
Token.set_extension("token_test", default=False)
Span.set_extension("span_test", default=False)
doc[0:1]._.span_test = "span_attribute"
doc[0]._.token_test = 117
json_doc = doc.to_json(underscore=["span_test"])
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert "token_test" not in json_doc["underscore_token"]
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
def test_doc_to_json_underscore_error_attr(doc):
@ -94,11 +225,29 @@ def test_doc_to_json_span(doc):
assert len(json_doc["spans"]) == 1
assert len(json_doc["spans"]["test"]) == 2
assert json_doc["spans"]["test"][0]["start"] == 0
assert not schemas.validate(schemas.DocJSONSchema, json_doc)
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
def test_json_to_doc(doc):
new_doc = Doc(doc.vocab).from_json(doc.to_json(), validate=True)
json_doc = doc.to_json()
json_doc = srsly.json_loads(srsly.json_dumps(json_doc))
new_doc = Doc(doc.vocab).from_json(json_doc, validate=True)
assert new_doc.text == doc.text == "c d e "
assert len(new_doc) == len(doc) == 3
assert new_doc[0].pos == doc[0].pos
assert new_doc[0].tag == doc[0].tag
assert new_doc[0].dep == doc[0].dep
assert new_doc[0].head.idx == doc[0].head.idx
assert new_doc[0].lemma == doc[0].lemma
assert len(new_doc.ents) == 1
assert new_doc.ents[0].start == 1
assert new_doc.ents[0].end == 2
assert new_doc.ents[0].label_ == "ORG"
assert doc.to_bytes() == new_doc.to_bytes()
def test_json_to_doc_compat(doc, doc_json):
new_doc = Doc(doc.vocab).from_json(doc_json, validate=True)
new_tokens = [token for token in new_doc]
assert new_doc.text == doc.text == "c d e "
assert len(new_tokens) == len([token for token in doc]) == 3
@ -114,11 +263,8 @@ def test_json_to_doc(doc):
def test_json_to_doc_underscore(doc):
if not Doc.has_extension("json_test1"):
Doc.set_extension("json_test1", default=False)
if not Doc.has_extension("json_test2"):
Doc.set_extension("json_test2", default=False)
Doc.set_extension("json_test1", default=False)
Doc.set_extension("json_test2", default=False)
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
@ -126,6 +272,34 @@ def test_json_to_doc_underscore(doc):
assert all([new_doc.has_extension(f"json_test{i}") for i in range(1, 3)])
assert new_doc._.json_test1 == "hello world"
assert new_doc._.json_test2 == [1, 2, 3]
assert doc.to_bytes() == new_doc.to_bytes()
def test_json_to_doc_with_token_span_attributes(doc):
Doc.set_extension("json_test1", default=False)
Doc.set_extension("json_test2", default=False)
Token.set_extension("token_test", default=False)
Span.set_extension("span_test", default=False)
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
doc[0:1]._.span_test = "span_attribute"
doc[0]._.token_test = 117
json_doc = doc.to_json(
underscore=["json_test1", "json_test2", "token_test", "span_test"]
)
json_doc = srsly.json_loads(srsly.json_dumps(json_doc))
new_doc = Doc(doc.vocab).from_json(json_doc, validate=True)
assert all([new_doc.has_extension(f"json_test{i}") for i in range(1, 3)])
assert new_doc._.json_test1 == "hello world"
assert new_doc._.json_test2 == [1, 2, 3]
assert new_doc[0]._.token_test == 117
assert new_doc[0:1]._.span_test == "span_attribute"
assert new_doc.user_data == doc.user_data
assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes(
exclude=["user_data"]
)
def test_json_to_doc_spans(doc):

View File

@ -692,3 +692,23 @@ def test_span_group_copy(doc):
assert len(doc.spans["test"]) == 3
# check that the copy spans were not modified and this is an isolated doc
assert len(doc_copy.spans["test"]) == 2
@pytest.mark.issue(11113)
def test_span_ent_id(en_tokenizer):
doc = en_tokenizer("a b c d")
doc.ents = [Span(doc, 1, 3, label="A", span_id="ID0")]
span = doc.ents[0]
assert doc[1].ent_id_ == "ID0"
# setting Span.id sets Token.ent_id
span.id_ = "ID1"
doc.ents = [span]
assert doc.ents[0].ent_id_ == "ID1"
assert doc[1].ent_id_ == "ID1"
# Span.ent_id is an alias of Span.id
span.ent_id_ = "ID2"
doc.ents = [span]
assert doc.ents[0].ent_id_ == "ID2"
assert doc[1].ent_id_ == "ID2"

View File

@ -7,3 +7,11 @@ import pytest
def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
test_lemma = ko_tokenizer(word)[0].lemma_
assert test_lemma == lemma
@pytest.mark.parametrize(
"word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", ""), ("뭡니까", ""), ("됐다", "")]
)
def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
test_lemma = ko_tokenizer_natto(word)[0].lemma_
assert test_lemma == lemma

View File

@ -22,3 +22,23 @@ def test_ko_tokenizer_pickle(ko_tokenizer):
b = pickle.dumps(ko_tokenizer)
ko_tokenizer_re = pickle.loads(b)
assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
tokenizer_bytes = ko_tokenizer_natto.to_bytes()
nlp = Korean()
nlp.tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
with make_tempdir() as d:
file_path = d / "tokenizer"
ko_tokenizer_natto.to_disk(file_path)
nlp = Korean()
nlp.tokenizer.from_disk(file_path)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
b = pickle.dumps(ko_tokenizer_natto)
ko_tokenizer_natto_re = pickle.loads(b)
assert ko_tokenizer_natto.to_bytes() == ko_tokenizer_natto_re.to_bytes()

View File

@ -19,6 +19,8 @@ POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
"PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
# fmt: on
# tests for ko_tokenizer (default KoreanTokenizer)
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):
@ -44,7 +46,7 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
assert pos == expected_pos.split()
def test_ko_empty_doc(ko_tokenizer):
def test_ko_tokenizer_empty_doc(ko_tokenizer):
tokens = ko_tokenizer("")
assert len(tokens) == 0
@ -55,6 +57,44 @@ def test_ko_tokenizer_unknown_tag(ko_tokenizer):
assert tokens[1].pos_ == "X"
# same tests for ko_tokenizer_natto (KoreanNattoTokenizer)
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ko_tokenizer_natto(ko_tokenizer_natto, text, expected_tokens):
tokens = [token.text for token in ko_tokenizer_natto(text)]
assert tokens == expected_tokens.split()
@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
def test_ko_tokenizer_natto_tags(ko_tokenizer_natto, text, expected_tags):
tags = [token.tag_ for token in ko_tokenizer_natto(text)]
assert tags == expected_tags.split()
@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
def test_ko_tokenizer_natto_full_tags(ko_tokenizer_natto, text, expected_tags):
tags = ko_tokenizer_natto(text).user_data["full_tags"]
assert tags == expected_tags.split()
@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
def test_ko_tokenizer_natto_pos(ko_tokenizer_natto, text, expected_pos):
pos = [token.pos_ for token in ko_tokenizer_natto(text)]
assert pos == expected_pos.split()
def test_ko_tokenizer_natto_empty_doc(ko_tokenizer_natto):
tokens = ko_tokenizer_natto("")
assert len(tokens) == 0
@pytest.mark.issue(10535)
def test_ko_tokenizer_natto_unknown_tag(ko_tokenizer_natto):
tokens = ko_tokenizer_natto("미닛 리피터")
assert tokens[1].pos_ == "X"
# fmt: off
SPACY_TOKENIZER_TESTS = [
("있다.", "있다 ."),

View File

View File

@ -0,0 +1,15 @@
import pytest
LG_BASIC_TOKENIZATION_TESTS = [
(
"Abooluganda abemmamba ababiri",
["Abooluganda", "abemmamba", "ababiri"],
),
]
@pytest.mark.parametrize("text,expected_tokens", LG_BASIC_TOKENIZATION_TESTS)
def test_lg_tokenizer_basic(lg_tokenizer, text, expected_tokens):
tokens = lg_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list

View File

@ -1,5 +1,6 @@
from spacy.tokens import Doc
import pytest
from spacy.tokens import Doc
from spacy.util import filter_spans
@pytest.fixture
@ -207,3 +208,18 @@ def test_chunking(nl_sample, nl_reference_chunking):
"""
chunks = [s.text.lower() for s in nl_sample.noun_chunks]
assert chunks == nl_reference_chunking
@pytest.mark.issue(10846)
def test_no_overlapping_chunks(nl_vocab):
# fmt: off
doc = Doc(
nl_vocab,
words=["Dit", "programma", "wordt", "beschouwd", "als", "'s", "werelds", "eerste", "computerprogramma"],
deps=["det", "nsubj:pass", "aux:pass", "ROOT", "mark", "det", "fixed", "amod", "xcomp"],
heads=[1, 3, 3, 3, 8, 8, 5, 8, 3],
pos=["DET", "NOUN", "AUX", "VERB", "SCONJ", "DET", "NOUN", "ADJ", "NOUN"],
)
# fmt: on
chunks = list(doc.noun_chunks)
assert filter_spans(chunks) == chunks

View File

@ -2,6 +2,9 @@ import pytest
from spacy.tokens import Doc
pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_ru_doc_lemmatization(ru_lemmatizer):
words = ["мама", "мыла", "раму"]
pos = ["NOUN", "VERB", "NOUN"]

View File

@ -20,7 +20,6 @@ od katerih so te svoboščine odvisne,
assert len(tokens) == 116
@pytest.mark.xfail
def test_ordinal_number(sl_tokenizer):
text = "10. decembra 1948"
tokens = sl_tokenizer(text)

View File

@ -26,14 +26,6 @@ def test_attrs_idempotence(text):
assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}
@pytest.mark.parametrize("text", ["dog"])
def test_attrs_do_deprecated(text):
int_attrs = intify_attrs(
{"F": text, "is_alpha": True}, strings_map={text: 10}, _do_deprecated=True
)
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
def test_attrs_ent_iob_intify():
int_attrs = intify_attrs({"ENT_IOB": ""})
assert int_attrs == {ENT_IOB: 0}

View File

@ -1,6 +1,10 @@
import pytest
from spacy.tokens import Doc
pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_uk_lemmatizer(uk_lemmatizer):
"""Check that the default uk lemmatizer runs."""
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])

View File

@ -1,4 +1,5 @@
import pytest
import warnings
import srsly
from mock import Mock
@ -344,13 +345,13 @@ def test_phrase_matcher_validation(en_vocab):
matcher.add("TEST1", [doc1])
with pytest.warns(UserWarning):
matcher.add("TEST2", [doc2])
with pytest.warns(None) as record:
with warnings.catch_warnings():
warnings.simplefilter("error")
matcher.add("TEST3", [doc3])
assert not record.list
matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
with pytest.warns(None) as record:
with warnings.catch_warnings():
warnings.simplefilter("error")
matcher.add("TEST4", [doc2])
assert not record.list
def test_attr_validation(en_vocab):

View File

@ -4,8 +4,8 @@ from pathlib import Path
def test_build_dependencies():
# Check that library requirements are pinned exactly the same across different setup files.
# TODO: correct checks for numpy rather than ignoring
libs_ignore_requirements = [
"cython",
"pytest",
"pytest-timeout",
"mock",
@ -21,7 +21,7 @@ def test_build_dependencies():
# ignore language-specific packages that shouldn't be installed by all
libs_ignore_setup = [
"fugashi",
"natto-py",
"mecab-ko",
"pythainlp",
"sudachipy",
"sudachidict_core",

View File

@ -1049,6 +1049,10 @@ def test_no_gold_ents(patterns):
for eg in train_examples:
eg.predicted = ruler(eg.predicted)
# Entity ruler is no longer needed (initialization below wipes out the
# patterns and causes warnings)
nlp.remove_pipe("entity_ruler")
def create_kb(vocab):
# create artificial KB
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)

View File

@ -659,3 +659,14 @@ def test_multiprocessing_gpu_warning(nlp2, texts):
# Trigger multi-processing.
for _ in docs:
pass
def test_dot_in_factory_names(nlp):
Language.component("my_evil_component", func=evil_component)
nlp.add_pipe("my_evil_component")
with pytest.raises(ValueError, match="not permitted"):
Language.component("my.evil.component.v1", func=evil_component)
with pytest.raises(ValueError, match="not permitted"):
Language.factory("my.evil.component.v1", func=evil_component)

View File

@ -33,6 +33,9 @@ URLS_SHOULD_MATCH = [
"http://userid:password@example.com/",
"http://142.42.1.1/",
"http://142.42.1.1:8080/",
"http://10.140.12.13/foo",
"http://10.140.12.13/foo/bar?arg1=baz&arg2=taz",
"http://10.1.1.1",
"http://foo.com/blah_(wikipedia)#cite-1",
"http://foo.com/blah_(wikipedia)_blah#cite-1",
"http://foo.com/unicode_(✪)_in_parens",
@ -94,6 +97,7 @@ URLS_SHOULD_NOT_MATCH = [
"http://foo.bar/foo(bar)baz quux",
"http://-error-.invalid/",
"http://a.b-.co",
# Loopback and broadcast addresses should be excluded
"http://0.0.0.0",
"http://10.1.1.0",
"http://10.1.1.255",
@ -102,7 +106,6 @@ URLS_SHOULD_NOT_MATCH = [
"http://3628126748",
"http://.www.foo.bar/",
"http://.www.foo.bar./",
"http://10.1.1.1",
"NASDAQ:GOOG",
"http://-a.b.co",
pytest.param("foo.com", marks=pytest.mark.xfail()),

View File

@ -431,3 +431,41 @@ def test_Example_aligned_whitespace(en_vocab):
example = Example(predicted, reference)
assert example.get_aligned("TAG", as_string=True) == tags
@pytest.mark.issue("11260")
def test_issue11260():
annots = {
"words": ["I", "like", "New", "York", "."],
"spans": {
"cities": [(7, 15, "LOC", "")],
"people": [(0, 1, "PERSON", "")],
},
}
vocab = Vocab()
predicted = Doc(vocab, words=annots["words"])
example = Example.from_dict(predicted, annots)
assert len(example.reference.spans["cities"]) == 1
assert len(example.reference.spans["people"]) == 1
output_dict = example.to_dict()
assert "spans" in output_dict["doc_annotation"]
assert output_dict["doc_annotation"]["spans"]["cities"] == annots["spans"]["cities"]
assert output_dict["doc_annotation"]["spans"]["people"] == annots["spans"]["people"]
output_example = Example.from_dict(predicted, output_dict)
assert len(output_example.reference.spans["cities"]) == len(
example.reference.spans["cities"]
)
assert len(output_example.reference.spans["people"]) == len(
example.reference.spans["people"]
)
for span in example.reference.spans["cities"]:
assert span.label_ == "LOC"
assert span.text == "New York"
assert span.start_char == 7
for span in example.reference.spans["people"]:
assert span.label_ == "PERSON"
assert span.text == "I"
assert span.start_char == 0

View File

@ -23,11 +23,7 @@ cdef class Tokenizer:
cdef object _infix_finditer
cdef object _rules
cdef PhraseMatcher _special_matcher
# TODO convert to bool in v4
cdef int _faster_heuristics
# TODO next one is unused and should be removed in v4
# https://github.com/explosion/spaCy/pull/9150
cdef int _unused_int2
cdef bint _faster_heuristics
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1

View File

@ -8,7 +8,6 @@ from preshed.maps cimport PreshMap
cimport cython
import re
import warnings
from .tokens.doc cimport Doc
from .strings cimport hash_string
@ -16,9 +15,9 @@ from .lexeme cimport EMPTY_LEXEME
from .attrs import intify_attrs
from .symbols import ORTH, NORM
from .errors import Errors, Warnings
from .errors import Errors
from . import util
from .util import registry, get_words_and_spaces
from .util import get_words_and_spaces
from .attrs import intify_attrs
from .symbols import ORTH
from .scorer import Scorer
@ -128,10 +127,10 @@ cdef class Tokenizer:
property faster_heuristics:
def __get__(self):
return bool(self._faster_heuristics)
return self._faster_heuristics
def __set__(self, faster_heuristics):
self._faster_heuristics = bool(faster_heuristics)
self._faster_heuristics = faster_heuristics
self._reload_special_cases()
def __reduce__(self):
@ -582,7 +581,7 @@ cdef class Tokenizer:
substrings (iterable): A sequence of dicts, where each dict describes
a token and its attributes.
"""
attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings]
attrs = [intify_attrs(spec) for spec in substrings]
orth = "".join([spec[ORTH] for spec in attrs])
if chunk != orth:
raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
@ -650,7 +649,7 @@ cdef class Tokenizer:
url_match = re.compile("a^").match
special_cases = {}
for orth, special_tokens in self.rules.items():
special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings) for special_token in special_tokens]
tokens = []
for substring in text.split():
suffixes = []

View File

@ -809,27 +809,33 @@ cdef class Doc:
self.c[i].ent_iob = 1
self.c[i].ent_type = span.label
self.c[i].ent_kb_id = span.kb_id
# for backwards compatibility in v3, only set ent_id from
# span.id if it's set, otherwise don't override
self.c[i].ent_id = span.id if span.id else self.c[i].ent_id
self.c[i].ent_id = span.id
for span in blocked:
for i in range(span.start, span.end):
self.c[i].ent_iob = 3
self.c[i].ent_type = 0
self.c[i].ent_kb_id = 0
self.c[i].ent_id = 0
for span in missing:
for i in range(span.start, span.end):
self.c[i].ent_iob = 0
self.c[i].ent_type = 0
self.c[i].ent_kb_id = 0
self.c[i].ent_id = 0
for span in outside:
for i in range(span.start, span.end):
self.c[i].ent_iob = 2
self.c[i].ent_type = 0
self.c[i].ent_kb_id = 0
self.c[i].ent_id = 0
# Set tokens outside of all provided spans
if default != SetEntsDefault.unmodified:
for i in range(self.length):
if i not in seen_tokens:
self.c[i].ent_type = 0
self.c[i].ent_kb_id = 0
self.c[i].ent_id = 0
if default == SetEntsDefault.outside:
self.c[i].ent_iob = 2
elif default == SetEntsDefault.missing:
@ -1603,13 +1609,30 @@ cdef class Doc:
ents.append(char_span)
self.ents = ents
# Add custom attributes. Note that only Doc extensions are currently considered, Token and Span extensions are
# not yet supported.
# Add custom attributes for the whole Doc object.
for attr in doc_json.get("_", {}):
if not Doc.has_extension(attr):
Doc.set_extension(attr)
self._.set(attr, doc_json["_"][attr])
if doc_json.get("underscore_token", {}):
for token_attr in doc_json["underscore_token"]:
token_start = doc_json["underscore_token"][token_attr]["token_start"]
value = doc_json["underscore_token"][token_attr]["value"]
if not Token.has_extension(token_attr):
Token.set_extension(token_attr)
self[token_start]._.set(token_attr, value)
if doc_json.get("underscore_span", {}):
for span_attr in doc_json["underscore_span"]:
token_start = doc_json["underscore_span"][span_attr]["token_start"]
token_end = doc_json["underscore_span"][span_attr]["token_end"]
value = doc_json["underscore_span"][span_attr]["value"]
if not Span.has_extension(span_attr):
Span.set_extension(span_attr)
self[token_start:token_end]._.set(span_attr, value)
return self
def to_json(self, underscore=None):
@ -1651,20 +1674,40 @@ cdef class Doc:
for span_group in self.spans:
data["spans"][span_group] = []
for span in self.spans[span_group]:
span_data = {
"start": span.start_char, "end": span.end_char, "label": span.label_, "kb_id": span.kb_id_
}
span_data = {"start": span.start_char, "end": span.end_char, "label": span.label_, "kb_id": span.kb_id_}
data["spans"][span_group].append(span_data)
if underscore:
data["_"] = {}
user_keys = set()
if self.user_data:
data["_"] = {}
data["underscore_token"] = {}
data["underscore_span"] = {}
for data_key in self.user_data:
if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
attr = data_key[1]
start = data_key[2]
end = data_key[3]
if attr in underscore:
user_keys.add(attr)
value = self.user_data[data_key]
if not srsly.is_json_serializable(value):
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
# Check if doc attribute
if start is None:
data["_"][attr] = value
# Check if token attribute
elif end is None:
if attr not in data["underscore_token"]:
data["underscore_token"][attr] = {"token_start": start, "value": value}
# Else span attribute
else:
if attr not in data["underscore_span"]:
data["underscore_span"][attr] = {"token_start": start, "token_end": end, "value": value}
for attr in underscore:
if not self.has_extension(attr):
if attr not in user_keys:
raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
value = self._.get(attr)
if not srsly.is_json_serializable(value):
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
data["_"][attr] = value
return data
def to_utf8_array(self, int nr_char=-1):

View File

@ -115,17 +115,23 @@ class Span:
end: int
start_char: int
end_char: int
label: int
kb_id: int
ent_id: int
ent_id_: str
@property
def label(self) -> int: ...
@property
def kb_id(self) -> int: ...
@property
def id(self) -> int: ...
@property
def id_(self) -> str: ...
def ent_id(self) -> int: ...
@property
def orth_(self) -> str: ...
@property
def lemma_(self) -> str: ...
label_: str
kb_id_: str
@property
def label_(self) -> str: ...
@property
def kb_id_(self) -> str: ...
@property
def id_(self) -> str: ...
@property
def ent_id_(self) -> str: ...

View File

@ -802,28 +802,18 @@ cdef class Span:
property id:
def __get__(self):
cdef SpanC* span_c = self.span_c()
return span_c.id
return self.span_c().id
def __set__(self, attr_t id):
cdef SpanC* span_c = self.span_c()
span_c.id = id
self.span_c().id = id
property ent_id:
"""RETURNS (uint64): The entity ID."""
"""Alias for the span's ID."""
def __get__(self):
return self.root.ent_id
return self.id
def __set__(self, hash_t key):
raise NotImplementedError(Errors.E200.format(attr="ent_id"))
property ent_id_:
"""RETURNS (str): The (string) entity ID."""
def __get__(self):
return self.root.ent_id_
def __set__(self, str key):
raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
def __set__(self, attr_t ent_id):
self.id = ent_id
@property
def orth_(self):
@ -839,7 +829,7 @@ cdef class Span:
return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
property label_:
"""RETURNS (str): The span's label."""
"""The span's label."""
def __get__(self):
return self.doc.vocab.strings[self.label]
@ -847,7 +837,7 @@ cdef class Span:
self.label = self.doc.vocab.strings.add(label_)
property kb_id_:
"""RETURNS (str): The span's KB ID."""
"""The span's KB ID."""
def __get__(self):
return self.doc.vocab.strings[self.kb_id]
@ -855,13 +845,22 @@ cdef class Span:
self.kb_id = self.doc.vocab.strings.add(kb_id_)
property id_:
"""RETURNS (str): The span's ID."""
"""The span's ID."""
def __get__(self):
return self.doc.vocab.strings[self.id]
def __set__(self, str id_):
self.id = self.doc.vocab.strings.add(id_)
property ent_id_:
"""Alias for the span's ID."""
def __get__(self):
return self.id_
def __set__(self, str ent_id_):
self.id_ = ent_id_
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
# Don't allow spaces to be the root, if there are

View File

@ -361,6 +361,7 @@ cdef class Example:
"doc_annotation": {
"cats": dict(self.reference.cats),
"entities": doc_to_biluo_tags(self.reference),
"spans": self._spans_to_dict(),
"links": self._links_to_dict()
},
"token_annotation": {
@ -376,6 +377,18 @@ cdef class Example:
}
}
def _spans_to_dict(self):
span_dict = {}
for key in self.reference.spans:
span_tuples = []
for span in self.reference.spans[key]:
span_tuple = (span.start_char, span.end_char, span.label_, span.kb_id_)
span_tuples.append(span_tuple)
span_dict[key] = span_tuples
return span_dict
def _links_to_dict(self):
links = {}
for ent in self.reference.ents:

View File

@ -337,3 +337,5 @@ def ensure_shape(vectors_loc):
# store all the results in a list in memory
lines2 = open_file(vectors_loc)
yield from lines2
lines2.close()
lines.close()

View File

@ -795,6 +795,15 @@ def get_model_lower_version(constraint: str) -> Optional[str]:
return None
def is_prerelease_version(version: str) -> bool:
"""Check whether a version is a prerelease version.
version (str): The version, e.g. "3.0.0.dev1".
RETURNS (bool): Whether the version is a prerelease version.
"""
return Version(version).is_prerelease
def get_base_version(version: str) -> str:
"""Generate the base version without any prerelease identifiers.

View File

@ -32,7 +32,6 @@ cdef class Vocab:
cdef public object writing_system
cdef public object get_noun_chunks
cdef readonly int length
cdef public object _unused_object # TODO remove in v4, see #9150
cdef public object lex_attr_getters
cdef public object cfg

View File

@ -72,7 +72,6 @@ def unpickle_vocab(
sstore: StringStore,
vectors: Any,
morphology: Any,
_unused_object: Any,
lex_attr_getters: Any,
lookups: Any,
get_noun_chunks: Any,

View File

@ -268,8 +268,7 @@ cdef class Vocab:
cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
props = intify_attrs(props, strings_map=self.strings,
_do_deprecated=True)
props = intify_attrs(props, strings_map=self.strings)
token = &tokens[i]
# Set the special tokens up to have arbitrary attributes
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
@ -559,21 +558,18 @@ def pickle_vocab(vocab):
sstore = vocab.strings
vectors = vocab.vectors
morph = vocab.morphology
_unused_object = vocab._unused_object
lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
lookups = vocab.lookups
get_noun_chunks = vocab.get_noun_chunks
return (unpickle_vocab,
(sstore, vectors, morph, _unused_object, lex_attr_getters, lookups, get_noun_chunks))
(sstore, vectors, morph, lex_attr_getters, lookups, get_noun_chunks))
def unpickle_vocab(sstore, vectors, morphology, _unused_object,
lex_attr_getters, lookups, get_noun_chunks):
def unpickle_vocab(sstore, vectors, morphology, lex_attr_getters, lookups, get_noun_chunks):
cdef Vocab vocab = Vocab()
vocab.vectors = vectors
vocab.strings = sstore
vocab.morphology = morphology
vocab._unused_object = _unused_object
vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
vocab.lookups = lookups
vocab.get_noun_chunks = get_noun_chunks

View File

@ -1,7 +1,7 @@
---
title: AttributeRuler
tag: class
source: spacy/pipeline/attributeruler.py
source: spacy/pipeline/attribute_ruler.py
new: 3
teaser: 'Pipeline component for rule-based token attribute assignment'
api_string_name: attribute_ruler
@ -34,7 +34,7 @@ how the component should be configured. You can override its settings via the
| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/attributeruler.py
%%GITHUB_SPACY/spacy/pipeline/attribute_ruler.py
```
## AttributeRuler.\_\_init\_\_ {#init tag="method"}

View File

@ -395,12 +395,13 @@ file to keep track of your settings and hyperparameters and your own
> "pos": List[str],
> "morphs": List[str],
> "sent_starts": List[Optional[bool]],
> "deps": List[string],
> "deps": List[str],
> "heads": List[int],
> "entities": List[str],
> "entities": List[(int, int, str)],
> "cats": Dict[str, float],
> "links": Dict[(int, int), dict],
> "spans": Dict[str, List[Tuple]],
> }
> ```
@ -417,9 +418,10 @@ file to keep track of your settings and hyperparameters and your own
| `deps` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. ~~List[str]~~ |
| `heads` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. ~~List[int]~~ |
| `entities` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. ~~List[str]~~ |
| `entities` | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ |
| `entities` | **Option 2:** List of `(start_char, end_char, label)` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ |
| `cats` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. ~~Dict[str, float]~~ |
| `links` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. ~~Dict[Tuple[int, int], Dict]~~ |
| `spans` | Dictionary of `spans_key`/`List[Tuple]` pairs defining the spans for each spans key as `(start_char, end_char, label, kb_id)` tuples. ~~Dict[str, List[Tuple[int, int, str, str]]~~ |
<Infobox title="Notes and caveats">

View File

@ -158,10 +158,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
## DependencyParser.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
used to **initialize the model** of the component and can either be the full
training data or a representative sample. Initialization includes validating the
network,
returns an iterable of [`Example`](/api/example) objects. **At least one example
should be supplied.** The data examples are used to **initialize the model** of
the component and can either be the full training data or a representative
sample. Initialization includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize) and lets you customize
@ -179,7 +179,7 @@ This method was previously called `begin_training`.
>
> ```python
> parser = nlp.add_pipe("parser")
> parser.initialize(lambda: [], nlp=nlp)
> parser.initialize(lambda: examples, nlp=nlp)
> ```
>
> ```ini
@ -193,7 +193,7 @@ This method was previously called `begin_training`.
| Name | Description |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Dict[str, Dict[str, int]]]~~ |

View File

@ -142,10 +142,10 @@ and [`pipe`](/api/edittreelemmatizer#pipe) delegate to the
## EditTreeLemmatizer.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
used to **initialize the model** of the component and can either be the full
training data or a representative sample. Initialization includes validating the
network,
returns an iterable of [`Example`](/api/example) objects. **At least one example
should be supplied.** The data examples are used to **initialize the model** of
the component and can either be the full training data or a representative
sample. Initialization includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize) and lets you customize
@ -157,7 +157,7 @@ config.
>
> ```python
> lemmatizer = nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")
> lemmatizer.initialize(lambda: [], nlp=nlp)
> lemmatizer.initialize(lambda: examples, nlp=nlp)
> ```
>
> ```ini
@ -171,7 +171,7 @@ config.
| Name | Description |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |

View File

@ -186,10 +186,10 @@ with the current vocab.
## EntityLinker.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
used to **initialize the model** of the component and can either be the full
training data or a representative sample. Initialization includes validating the
network,
returns an iterable of [`Example`](/api/example) objects. **At least one example
should be supplied.** The data examples are used to **initialize the model** of
the component and can either be the full training data or a representative
sample. Initialization includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize).
@ -209,15 +209,15 @@ This method was previously called `begin_training`.
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker")
> entity_linker.initialize(lambda: [], nlp=nlp, kb_loader=my_kb)
> entity_linker.initialize(lambda: examples, nlp=nlp, kb_loader=my_kb)
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |
| Name | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |
## EntityLinker.predict {#predict tag="method"}

View File

@ -154,10 +154,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
## EntityRecognizer.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
used to **initialize the model** of the component and can either be the full
training data or a representative sample. Initialization includes validating the
network,
returns an iterable of [`Example`](/api/example) objects. **At least one example
should be supplied.** The data examples are used to **initialize the model** of
the component and can either be the full training data or a representative
sample. Initialization includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize) and lets you customize
@ -175,7 +175,7 @@ This method was previously called `begin_training`.
>
> ```python
> ner = nlp.add_pipe("ner")
> ner.initialize(lambda: [], nlp=nlp)
> ner.initialize(lambda: examples, nlp=nlp)
> ```
>
> ```ini
@ -189,7 +189,7 @@ This method was previously called `begin_training`.
| Name | Description |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Dict[str, Dict[str, int]]]~~ |

View File

@ -1,7 +1,7 @@
---
title: EntityRuler
tag: class
source: spacy/pipeline/entityruler.py
source: spacy/pipeline/entity_ruler.py
new: 2.1
teaser: 'Pipeline component for rule-based named entity recognition'
api_string_name: entity_ruler
@ -64,7 +64,7 @@ how the component should be configured. You can override its settings via the
| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/entityruler.py
%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py
```
## EntityRuler.\_\_init\_\_ {#init tag="method"}

View File

@ -70,7 +70,7 @@ lemmatizer is available. The lemmatizer modes `rule` and `pos_lookup` require
[`token.pos`](/api/token) from a previous pipeline component (see example
pipeline configurations in the
[pretrained pipeline design details](/models#design-cnn)) or rely on third-party
libraries (`pymorphy2`).
libraries (`pymorphy3`).
| Language | Default Mode |
| -------- | ------------ |
@ -86,9 +86,9 @@ libraries (`pymorphy2`).
| `nb` | `rule` |
| `nl` | `rule` |
| `pl` | `pos_lookup` |
| `ru` | `pymorphy2` |
| `ru` | `pymorphy3` |
| `sv` | `rule` |
| `uk` | `pymorphy2` |
| `uk` | `pymorphy3` |
```python
%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py

View File

@ -148,10 +148,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
## Morphologizer.initialize {#initialize tag="method"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
used to **initialize the model** of the component and can either be the full
training data or a representative sample. Initialization includes validating the
network,
returns an iterable of [`Example`](/api/example) objects. **At least one example
should be supplied.** The data examples are used to **initialize the model** of
the component and can either be the full training data or a representative
sample. Initialization includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize) and lets you customize
@ -163,7 +163,7 @@ config.
>
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
> morphologizer.initialize(lambda: [], nlp=nlp)
> morphologizer.initialize(lambda: examples, nlp=nlp)
> ```
>
> ```ini
@ -177,7 +177,7 @@ config.
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |

View File

@ -133,10 +133,10 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
## SentenceRecognizer.initialize {#initialize tag="method"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
used to **initialize the model** of the component and can either be the full
training data or a representative sample. Initialization includes validating the
network,
returns an iterable of [`Example`](/api/example) objects. **At least one example
should be supplied.** The data examples are used to **initialize the model** of
the component and can either be the full training data or a representative
sample. Initialization includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize).
@ -145,14 +145,14 @@ by [`Language.initialize`](/api/language#initialize).
>
> ```python
> senter = nlp.add_pipe("senter")
> senter.initialize(lambda: [], nlp=nlp)
> senter.initialize(lambda: examples, nlp=nlp)
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| Name | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
## SentenceRecognizer.predict {#predict tag="method"}

View File

@ -561,8 +561,8 @@ overlaps with will be returned.
| `lemma_` | The span's lemma. Equivalent to `"".join(token.text_with_ws for token in span)`. ~~str~~ |
| `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ |
| `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ |
| `ent_id` | The hash value of the named entity the root token is an instance of. ~~int~~ |
| `ent_id_` | The string ID of the named entity the root token is an instance of. ~~str~~ |
| `ent_id` | Alias for `id`: the hash value of the span's ID. ~~int~~ |
| `ent_id_` | Alias for `id_`: the span's ID. ~~str~~ |
| `id` | The hash value of the span's ID. ~~int~~ |
| `id_` | The span's ID. ~~str~~ |
| `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ |

View File

@ -148,10 +148,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/spancategorizer#call) and
## SpanCategorizer.initialize {#initialize tag="method"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
used to **initialize the model** of the component and can either be the full
training data or a representative sample. Initialization includes validating the
network,
returns an iterable of [`Example`](/api/example) objects. **At least one example
should be supplied.** The data examples are used to **initialize the model** of
the component and can either be the full training data or a representative
sample. Initialization includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize) and lets you customize
@ -163,7 +163,7 @@ config.
>
> ```python
> spancat = nlp.add_pipe("spancat")
> spancat.initialize(lambda: [], nlp=nlp)
> spancat.initialize(lambda: examples, nlp=nlp)
> ```
>
> ```ini
@ -177,7 +177,7 @@ config.
| Name | Description |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |

View File

@ -131,10 +131,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
## Tagger.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
used to **initialize the model** of the component and can either be the full
training data or a representative sample. Initialization includes validating the
network,
returns an iterable of [`Example`](/api/example) objects. **At least one example
should be supplied.** The data examples are used to **initialize the model** of
the component and can either be the full training data or a representative
sample. Initialization includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize) and lets you customize
@ -152,7 +152,7 @@ This method was previously called `begin_training`.
>
> ```python
> tagger = nlp.add_pipe("tagger")
> tagger.initialize(lambda: [], nlp=nlp)
> tagger.initialize(lambda: examples, nlp=nlp)
> ```
>
> ```ini
@ -166,7 +166,7 @@ This method was previously called `begin_training`.
| Name | Description |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |

View File

@ -84,6 +84,7 @@ architectures and their arguments and hyperparameters.
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/textcat.py
@ -176,10 +177,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
## TextCategorizer.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. The data examples are
used to **initialize the model** of the component and can either be the full
training data or a representative sample. Initialization includes validating the
network,
returns an iterable of [`Example`](/api/example) objects. **At least one example
should be supplied.** The data examples are used to **initialize the model** of
the component and can either be the full training data or a representative
sample. Initialization includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize) and lets you customize
@ -197,7 +198,7 @@ This method was previously called `begin_training`.
>
> ```python
> textcat = nlp.add_pipe("textcat")
> textcat.initialize(lambda: [], nlp=nlp)
> textcat.initialize(lambda: examples, nlp=nlp)
> ```
>
> ```ini
@ -212,7 +213,7 @@ This method was previously called `begin_training`.
| Name | Description |
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |

View File

@ -127,10 +127,10 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
function that returns an iterable of [`Example`](/api/example) objects. The data
examples are used to **initialize the model** of the component and can either be
the full training data or a representative sample. Initialization includes
validating the network,
function that returns an iterable of [`Example`](/api/example) objects. **At
least one example should be supplied.** The data examples are used to
**initialize the model** of the component and can either be the full training
data or a representative sample. Initialization includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize).
@ -139,14 +139,14 @@ by [`Language.initialize`](/api/language#initialize).
>
> ```python
> tok2vec = nlp.add_pipe("tok2vec")
> tok2vec.initialize(lambda: [], nlp=nlp)
> tok2vec.initialize(lambda: examples, nlp=nlp)
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| Name | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
## Tok2Vec.predict {#predict tag="method"}

View File

@ -425,8 +425,8 @@ The L2 norm of the token's vector representation.
| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ |
| `ent_kb_id` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ |
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ |
| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ |
| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ |
| `ent_id` | ID of the entity the token is an instance of, if any. ~~int~~ |
| `ent_id_` | ID of the entity the token is an instance of, if any. ~~str~~ |
| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ |
| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ |
| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ |

View File

@ -240,7 +240,7 @@ browser. Will run a simple web server.
| Name | Description |
| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
@ -265,7 +265,7 @@ Render a dependency parse tree or named entity visualization.
| Name | Description |
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ |
| `style` | Visualization style,`"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` <Tag variant="new">3.3</Tag>. Defaults to `"dep"`. ~~str~~ |
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
@ -273,6 +273,73 @@ Render a dependency parse tree or named entity visualization.
| `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
| **RETURNS** | The rendered HTML markup. ~~str~~ |
### displacy.parse_deps {#displacy.parse_deps tag="method" new="2"}
Generate dependency parse in `{'words': [], 'arcs': []}` format.
For use with the `manual=True` argument in `displacy.render`.
> #### Example
>
> ```python
> import spacy
> from spacy import displacy
> nlp = spacy.load("en_core_web_sm")
> doc = nlp("This is a sentence.")
> deps_parse = displacy.parse_deps(doc)
> html = displacy.render(deps_parse, style="dep", manual=True)
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------------- |
| `orig_doc` | Doc to parse dependencies. ~~Doc~~ |
| `options` | Dependency parse specific visualisation options. ~~Dict[str, Any]~~ |
| **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~ |
### displacy.parse_ents {#displacy.parse_ents tag="method" new="2"}
Generate named entities in `[{start: i, end: i, label: 'label'}]` format.
For use with the `manual=True` argument in `displacy.render`.
> #### Example
>
> ```python
> import spacy
> from spacy import displacy
> nlp = spacy.load("en_core_web_sm")
> doc = nlp("But Google is starting from behind.")
> ents_parse = displacy.parse_ents(doc)
> html = displacy.render(ents_parse, style="ent", manual=True)
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------------- |
| `doc` | Doc to parse entities. ~~Doc~~ |
| `options` | NER-specific visualisation options. ~~Dict[str, Any]~~ |
| **RETURNS** | Generated entities keyed by text (original text) and ents. ~~dict~~ |
### displacy.parse_spans {#displacy.parse_spans tag="method" new="2"}
Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format.
For use with the `manual=True` argument in `displacy.render`.
> #### Example
>
> ```python
> import spacy
> from spacy import displacy
> nlp = spacy.load("en_core_web_sm")
> doc = nlp("But Google is starting from behind.")
> doc.spans['orgs'] = [doc[1:2]]
> ents_parse = displacy.parse_spans(doc, options={"spans_key" : "orgs"})
> html = displacy.render(ents_parse, style="span", manual=True)
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------------- |
| `doc` | Doc to parse entities. ~~Doc~~ |
| `options` | Span-specific visualisation options. ~~Dict[str, Any]~~ |
| **RETURNS** | Generated entities keyed by text (original text) and ents. ~~dict~~ |
### Visualizer options {#displacy_options}
The `options` argument lets you specify additional settings for each visualizer.

View File

@ -175,10 +175,10 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
function that returns an iterable of [`Example`](/api/example) objects. The data
examples are used to **initialize the model** of the component and can either be
the full training data or a representative sample. Initialization includes
validating the network,
function that returns an iterable of [`Example`](/api/example) objects. **At
least one example should be supplied.** The data examples are used to
**initialize the model** of the component and can either be the full training
data or a representative sample. Initialization includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize).
@ -187,14 +187,14 @@ by [`Language.initialize`](/api/language#initialize).
>
> ```python
> trf = nlp.add_pipe("transformer")
> trf.initialize(lambda: iter([]), nlp=nlp)
> trf.initialize(lambda: examples, nlp=nlp)
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| Name | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
## Transformer.predict {#predict tag="method"}

View File

@ -11,8 +11,8 @@ menu:
- ['Tokenization', 'tokenization']
- ['Merging & Splitting', 'retokenization']
- ['Sentence Segmentation', 'sbd']
- ['Vectors & Similarity', 'vectors-similarity']
- ['Mappings & Exceptions', 'mappings-exceptions']
- ['Vectors & Similarity', 'vectors-similarity']
- ['Language Data', 'language-data']
---

View File

@ -268,18 +268,49 @@ used for training the current [Japanese pipelines](/models/ja).
### Korean language support {#korean}
> #### mecab-ko tokenizer
There are currently three built-in options for Korean tokenization, two based on
[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) and one
using the rule-based tokenizer.
> #### Default mecab-ko tokenizer
>
> ```python
> # uses mecab-ko-dic
> nlp = spacy.blank("ko")
>
> # with custom mecab args
> mecab_args = "-d /path/to/dicdir -u /path/to/userdic"
> config = {"nlp": {"tokenizer": {"mecab_args": mecab_args}}}
> nlp = spacy.blank("ko", config=config)
> ```
The default MeCab-based Korean tokenizer requires:
The default MeCab-based Korean tokenizer requires the python package
[`mecab-ko`](https://pypi.org/project/mecab-ko/) and no further system
requirements.
The `natto-py` MeCab-based tokenizer (the previous default for spaCy v3.4 and
earlier) is available as `spacy.KoreanNattoTokenizer.v1`. It requires:
- [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
- [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
- [natto-py](https://github.com/buruzaemon/natto-py)
To use this tokenizer, edit `[nlp.tokenizer]` in your config:
> #### natto-py MeCab-ko tokenizer
>
> ```python
> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
> nlp = spacy.blank("ko", config=config)
> ```
```ini
### config.cfg
[nlp]
lang = "ko"
tokenizer = {"@tokenizers" = "spacy.KoreanNattoTokenizer.v1"}
```
For some Korean datasets and tasks, the
[rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited
than MeCab. To configure a Korean pipeline with the rule-based tokenizer:

View File

@ -1367,14 +1367,14 @@ patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
ruler.add_patterns(patterns)
doc1 = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
print([(ent.text, ent.label_, ent.id_) for ent in doc1.ents])
doc2 = nlp("Apple is opening its first big office in San Fran.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
```
If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
patterns, the `ent_id_` property of the matched entity is set to the `id` given
patterns, the `id_` property of the matched entity is set to the `id` given
in the patterns. So in the example above it's easy to identify that "San
Francisco" and "San Fran" are both the same entity.

View File

@ -195,7 +195,7 @@ the data to and from a JSON file.
>
> To see custom serialization methods in action, check out the new
> [`EntityRuler`](/api/entityruler) component and its
> [source](%%GITHUB_SPACY/spacy/pipeline/entityruler.py). Patterns added to the
> [source](%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py). Patterns added to the
> component will be saved to a `.jsonl` file if the pipeline is serialized to
> disk, and to a bytestring if the pipeline is serialized to bytes. This allows
> saving out a pipeline with a rule-based entity recognizer and including all

View File

@ -198,12 +198,12 @@ import DisplacySpanHtml from 'images/displacy-span.html'
The span visualizer lets you customize the following `options`:
| Argument | Description |
|-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|
| `spans_key` | Which spans key to render spans from. Default is `"sc"`. ~~str~~ |
| Argument | Description |
| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `spans_key` | Which spans key to render spans from. Default is `"sc"`. ~~str~~ |
| `templates` | Dictionary containing the keys `"span"`, `"slice"`, and `"start"`. These dictate how the overall span, a span slice, and the starting token will be rendered. ~~Optional[Dict[str, str]~~ |
| `kb_url_template` | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in ~~Optional[str]~~ |
| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ |
| `kb_url_template` | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in ~~Optional[str]~~ |
| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ |
Because spans can be stored across different keys in `doc.spans`, you need to specify
which one displaCy should use with `spans_key` (`sc` is the default).
@ -343,9 +343,21 @@ want to visualize output from other libraries, like [NLTK](http://www.nltk.org)
or
[SyntaxNet](https://github.com/tensorflow/models/tree/master/research/syntaxnet).
If you set `manual=True` on either `render()` or `serve()`, you can pass in data
in displaCy's format as a dictionary (instead of `Doc` objects).
in displaCy's format as a dictionary (instead of `Doc` objects). There are helper
functions for converting `Doc` objects to displaCy's format for use with `manual=True`:
[`displacy.parse_deps`](/api/top-level#displacy.parse_deps),
[`displacy.parse_ents`](/api/top-level#displacy.parse_ents),
and [`displacy.parse_spans`](/api/top-level#displacy.parse_spans).
> #### Example
> #### Example with parse function
>
> ```python
> doc = nlp("But Google is starting from behind.")
> ex = displacy.parse_ents(doc)
> html = displacy.render(ex, style="ent", manual=True)
> ```
> #### Example with raw data
>
> ```python
> ex = [{"text": "But Google is starting from behind.",
@ -354,6 +366,7 @@ in displaCy's format as a dictionary (instead of `Doc` objects).
> html = displacy.render(ex, style="ent", manual=True)
> ```
```python
### DEP input
{
@ -389,6 +402,18 @@ in displaCy's format as a dictionary (instead of `Doc` objects).
}
```
```python
### SPANS input
{
"text": "Welcome to the Bank of China.",
"spans": [
{"start_token": 3, "end_token": 6, "label": "ORG"},
{"start_token": 5, "end_token": 6, "label": "GPE"},
],
"tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
}
```
## Using displaCy in a web application {#webapp}
If you want to use the visualizers as part of a web application, for example to

View File

@ -265,6 +265,11 @@
"name": "Luxembourgish",
"has_examples": true
},
{
"code": "lg",
"name": "Luganda",
"has_examples": true
},
{
"code": "lij",
"name": "Ligurian",
@ -369,8 +374,8 @@
"has_examples": true,
"dependencies": [
{
"name": "pymorphy2",
"url": "https://github.com/kmike/pymorphy2"
"name": "pymorphy3",
"url": "https://github.com/no-plagiarism/pymorphy3"
}
],
"models": [
@ -467,10 +472,20 @@
"code": "uk",
"name": "Ukrainian",
"has_examples": true,
"models": [
"uk_core_news_sm",
"uk_core_news_md",
"uk_core_news_lg",
"uk_core_news_trf"
],
"dependencies": [
{
"name": "pymorphy2",
"url": "https://github.com/kmike/pymorphy2"
"name": "pymorphy3",
"url": "https://github.com/no-plagiarism/pymorphy3"
},
{
"name": "pymorphy3-dicts-uk",
"url": "https://github.com/no-plagiarism/pymorphy3-dicts"
}
]
},

View File

@ -1,5 +1,39 @@
{
"resources": [
{
"id": "concepcy",
"title": "concepCy",
"slogan": "A multilingual knowledge graph in spaCy",
"description": "A spaCy wrapper for ConceptNet, a freely-available semantic network designed to help computers understand the meaning of words.",
"github": "JulesBelveze/concepcy",
"pip": "concepcy",
"code_example": [
"import spacy",
"import concepcy",
"",
"nlp = spacy.load('en_core_web_sm')",
"# Using default concepCy configuration",
"nlp.add_pipe('concepcy')",
"",
"doc = nlp('WHO is a lovely company')",
"",
"# Access all the 'RelatedTo' relations from the Doc",
"for word, relations in doc._.relatedto.items():",
" print(f'Word: {word}\n{relations}')",
"",
"# Access the 'RelatedTo' relations word by word",
"for token in doc:",
" print(f'Word: {token}\n{token._.relatedto}')"
],
"category": ["pipeline"],
"image": "https://github.com/JulesBelveze/concepcy/blob/main/figures/concepcy.png",
"tags": ["semantic", "ConceptNet"],
"author": "Jules Belveze",
"author_links": {
"github": "JulesBelveze",
"website": "https://www.linkedin.com/in/jules-belveze/"
}
},
{
"id": "spacyfishing",
"title": "spaCy fishing",
@ -2604,7 +2638,7 @@
" Add the courgette, garlic, red peppers and oregano and cook for 23 minutes.",
" Later, add some oranges and chickens.\"\"\"",
"",
"# use any model that has internal spacy embeddings",
"# use any model that has internal spacy embeddings",
"nlp = spacy.load('en_core_web_lg')",
"nlp.add_pipe(\"concise_concepts\", ",
" config={\"data\": data}",
@ -2650,7 +2684,7 @@
" At that location, Nissin was founded.",
" Many students survived by eating these noodles, but they don't even know him.\"\"\"",
"",
"# use any model that has internal spacy embeddings",
"# use any model that has internal spacy embeddings",
"nlp = spacy.load('en_core_web_sm')",
"nlp.add_pipe(",
" \"xx_coref\", config={\"chunk_size\": 2500, \"chunk_overlap\": 2, \"device\": 0})",
@ -2833,7 +2867,7 @@
"doc = nlp(\"AE died in Princeton in 1955.\")",
"",
"print(doc._.clauses)",
"# Output:",
"# Output:",
"# <SV, AE, died, None, None, None, [in Princeton, in 1955]>",
"",
"propositions = doc._.clauses[0].to_propositions(as_text=True)",
@ -3599,7 +3633,7 @@
"",
"#Lexico Semantic (LxSem) Features",
"TTRF = LingFeat.TTRF_() #Type Token Ratio Features",
"VarF = LingFeat.VarF_() #Noun/Verb/Adj/Adv Variation Features",
"VarF = LingFeat.VarF_() #Noun/Verb/Adj/Adv Variation Features",
"PsyF = LingFeat.PsyF_() #Psycholinguistic Difficulty of Words (AoA Kuperman)",
"WoLF = LingFeat.WorF_() #Word Familiarity from Frequency Count (SubtlexUS)",
"",

View File

@ -114,7 +114,11 @@ function formatVectors(data) {
if (!data) return 'n/a'
if (Object.values(data).every(n => n === 0)) return 'context vectors only'
const { keys, vectors, width } = data
return `${abbrNum(keys)} keys, ${abbrNum(vectors)} unique vectors (${width} dimensions)`
if (keys >= 0) {
return `${abbrNum(keys)} keys, ${abbrNum(vectors)} unique vectors (${width} dimensions)`
} else {
return `${abbrNum(vectors)} floret vectors (${width} dimensions)`
}
}
function formatAccuracy(data, lang) {