Merge pull request #9673 from explosion/master

update develop branch for 3.3
Sofie Van Landeghem 2021-11-15 11:14:49 +01:00 committed by GitHub
commit 4694b43d87
59 changed files with 1330 additions and 111 deletions

View File

@ -65,11 +65,8 @@ steps:
condition: eq(${{ parameters.gpu }}, true)
- script: |
#python -m spacy download ca_core_news_sm
#python -m spacy download ca_core_news_md
# temporarily install the v3.1.0 models
pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_sm-3.1.0/ca_core_news_sm-3.1.0-py3-none-any.whl
pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_md-3.1.0/ca_core_news_md-3.1.0-py3-none-any.whl
python -m spacy download ca_core_news_sm
python -m spacy download ca_core_news_md
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8')
@ -98,8 +95,7 @@ steps:
- script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
# temporarily ignore W095
PYTHONWARNINGS="error,ignore:[W095]:UserWarning,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
displayName: 'Test assemble CLI'
condition: eq(variables['python_version'], '3.8')

View File

@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the MIT license.
💫 **Version 3.0 out now!**
💫 **Version 3.2 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)

View File

@ -16,8 +16,10 @@ gpu_allocator = null
[nlp]
lang = "{{ lang }}"
{%- set no_tok2vec = components|length == 1 and (("textcat" in components or "textcat_multilabel" in components) and optimize == "efficiency")-%}
{%- if not no_tok2vec and ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or "textcat" in components or "textcat_multilabel" in components) -%}
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
{%- set with_accuracy = optimize == "accuracy" -%}
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or has_accurate_textcat) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
{%- else -%}
{%- set full_pipeline = components %}
@ -199,7 +201,7 @@ no_output_layer = false
{# NON-TRANSFORMER PIPELINE #}
{% else -%}
{% if not no_tok2vec-%}
{% if "tok2vec" in full_pipeline -%}
[components.tok2vec]
factory = "tok2vec"

View File

@ -1,18 +1,13 @@
import warnings
def add_codes(err_cls):
"""Add error codes to string messages via class attribute names."""
class ErrorsWithCodes(err_cls):
def __getattribute__(self, code):
msg = super(ErrorsWithCodes, self).__getattribute__(code)
if code.startswith("__"): # python system attributes like __class__
return msg
else:
return "[{code}] {msg}".format(code=code, msg=msg)
return ErrorsWithCodes()
class ErrorsWithCodes(type):
def __getattribute__(self, code):
msg = super().__getattribute__(code)
if code.startswith("__"): # python system attributes like __class__
return msg
else:
return "[{code}] {msg}".format(code=code, msg=msg)
def setup_default_warnings():
@ -47,8 +42,7 @@ def _escape_warning_msg(msg):
# fmt: off
@add_codes
class Warnings:
class Warnings(metaclass=ErrorsWithCodes):
W005 = ("Doc object not parsed. This means displaCy won't be able to "
"generate a dependency visualization for it. Make sure the Doc "
"was processed with a model that supports dependency parsing, and "
@ -199,8 +193,7 @@ class Warnings:
"Vectors are calculated from character ngrams.")
@add_codes
class Errors:
class Errors(metaclass=ErrorsWithCodes):
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
"This usually happens when spaCy calls `nlp.{method}` with a custom "

View File

@ -1,58 +1,76 @@
from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span, Token
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
doc = doclike.doc
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = [
"nsubj",
"nsubj:pass",
"obj",
"obl",
"nmod",
"pcomp",
"appos",
"ROOT",
]
post_modifiers = ["flat", "fixed", "compound"]
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
if not len(doc):
return
np_deps = {doc.vocab.strings.add(label) for label in labels}
np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
np_label = doc.vocab.strings.add("NP")
left_labels = ["det", "fixed", "neg"] # ['nunmod', 'det', 'appos', 'fixed']
right_labels = ["flat", "fixed", "compound", "neg"]
stop_labels = ["punct"]
np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
adj_label = doc.vocab.strings.add("amod")
adp_label = doc.vocab.strings.add("ADP")
conj = doc.vocab.strings.add("conj")
conj_pos = doc.vocab.strings.add("CCONJ")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
right_childs = list(word.rights)
right_child = right_childs[0] if right_childs else None
prev_right = -1
for token in doclike:
if token.pos in [PROPN, NOUN, PRON]:
left, right = noun_bounds(
doc, token, np_left_deps, np_right_deps, stop_deps
)
if left.i <= prev_right:
continue
yield left.i, right.i + 1, np_label
prev_right = right.i
def is_verb_token(token: Token) -> bool:
return token.pos in [VERB, AUX]
def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
left_bound = root
for token in reversed(list(root.lefts)):
if token.dep in np_left_deps:
left_bound = token
right_bound = root
for token in root.rights:
if token.dep in np_right_deps:
left, right = noun_bounds(
doc, token, np_left_deps, np_right_deps, stop_deps
)
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
if list(filter(filter_func, doc[left_bound.i : right.i])):
break
if right_child:
if right_child.dep == adj_label:
right_end = right_child.right_edge
elif right_child.dep in np_modifs: # Check if we can expand to right
right_end = word.right_edge
else:
right_end = word
else:
right_bound = right
return left_bound, right_bound
right_end = word
prev_end = right_end.i
left_index = word.left_edge.i
left_index = (
left_index + 1 if word.left_edge.pos == adp_label else left_index
) # Eliminate left attached de, del
yield left_index, right_end.i + 1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
prev_end = word.i
left_index = word.left_edge.i # eliminate left attached conjunction
left_index = (
left_index + 1 if word.left_edge.pos == conj_pos else left_index
)
yield left_index, word.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
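For reference, a minimal usage sketch of the rewritten Spanish noun_chunks iterator, mirroring the new regression tests added later in this commit (a hand-built, pre-parsed Doc, so no trained model is needed):

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("es")
# "la camisa negra": determiner + noun + adjective, parsed by hand.
doc = Doc(
    nlp.vocab,
    words=["la", "camisa", "negra"],
    heads=[1, 1, 1],
    deps=["det", "ROOT", "amod"],
    pos=["DET", "NOUN", "ADJ"],
)
print([(chunk.text, chunk.start, chunk.end) for chunk in doc.noun_chunks])
# [('la camisa negra', 0, 3)]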

View File

@ -203,7 +203,11 @@ class Japanese(Language):
"extend": True,
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
},
default_score_weights={"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None},
default_score_weights={
"pos_acc": 0.5,
"morph_micro_f": 0.5,
"morph_per_feat": None,
},
)
def make_morphologizer(
nlp: Language,

View File

@ -1,6 +1,7 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language, BaseDefaults
@ -10,6 +11,7 @@ class PortugueseDefaults(BaseDefaults):
infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS

View File

@ -0,0 +1,85 @@
from typing import Union, Iterator, Tuple
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = [
"nsubj",
"nsubj:pass",
"obj",
"obl",
"obl:agent",
"nmod",
"pcomp",
"appos",
"ROOT",
]
post_modifiers = ["flat", "flat:name", "fixed", "compound"]
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = {doc.vocab.strings.add(label) for label in labels}
np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
np_label = doc.vocab.strings.add("NP")
adj_label = doc.vocab.strings.add("amod")
det_label = doc.vocab.strings.add("det")
det_pos = doc.vocab.strings.add("DET")
adp_label = doc.vocab.strings.add("ADP")
conj = doc.vocab.strings.add("conj")
conj_pos = doc.vocab.strings.add("CCONJ")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
right_childs = list(word.rights)
right_child = right_childs[0] if right_childs else None
if right_child:
if (
right_child.dep == adj_label
): # allow chain of adjectives by expanding to right
right_end = right_child.right_edge
elif (
right_child.dep == det_label and right_child.pos == det_pos
): # cut relative pronouns here
right_end = right_child
elif right_child.dep in np_modifs: # Check if we can expand to right
right_end = word.right_edge
else:
right_end = word
else:
right_end = word
prev_end = right_end.i
left_index = word.left_edge.i
left_index = (
left_index + 1 if word.left_edge.pos == adp_label else left_index
)
yield left_index, right_end.i + 1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
prev_end = word.i
left_index = word.left_edge.i # eliminate left attached conjunction
left_index = (
left_index + 1 if word.left_edge.pos == conj_pos else left_index
)
yield left_index, word.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
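A hedged usage sketch for the new Portuguese chunker: once SYNTAX_ITERATORS is registered on PortugueseDefaults (see the __init__.py hunk above), any parsed Portuguese Doc exposes it through the standard doc.noun_chunks property. This assumes a Portuguese pipeline with a parser, e.g. pt_core_news_sm, is installed; the model itself is not part of this commit.

import spacy

nlp = spacy.load("pt_core_news_sm")  # assumed to be installed separately
doc = nlp("O gato gordo da Susana e seu amigo")
# With a correct parse, roughly: "O gato gordo", "Susana", "seu amigo"
# (cf. the pt noun_chunks tests added below).
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.start, chunk.end)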

View File

@ -33,7 +33,9 @@ class RussianLemmatizer(Lemmatizer):
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer()
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
super().__init__(
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
string = token.text

View File

@ -27,7 +27,7 @@ _num_words = [
"ትሪልዮን",
"ኳድሪልዮን",
"ጋዚልዮን",
"ባዚልዮን"
"ባዚልዮን",
]
# Tigrinya ordinals above 10 are the same as _num_words but start with "መበል "
@ -41,7 +41,7 @@ _ordinal_words = [
"ሻውዓይ",
"ሻምናይ",
"ታሽዓይ",
"ዓስራይ"
"ዓስራይ",
]

View File

@ -29,4 +29,6 @@ class UkrainianLemmatizer(RussianLemmatizer):
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="uk")
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
super().__init__(
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)

View File

@ -1,4 +1,3 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.vi.examples import sentences

View File

@ -53,7 +53,7 @@ def build_hash_embed_cnn_tok2vec(
window_size (int): The number of tokens on either side to concatenate during
the convolutions. The receptive field of the CNN will be
depth * (window_size * 2 + 1), so a 4-layer network with window_size of
2 will be sensitive to 17 words at a time. Recommended value is 1.
2 will be sensitive to 20 words at a time. Recommended value is 1.
embed_size (int): The number of rows in the hash embedding tables. This can
be surprisingly small, due to the use of the hash embeddings. Recommended
values are between 2000 and 10000.
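As a quick sanity check on the corrected number above, the receptive-field formula from the docstring works out as follows:

# receptive field = depth * (window_size * 2 + 1)
depth, window_size = 4, 2
print(depth * (window_size * 2 + 1))  # 20 tokens, not 17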

View File

@ -303,7 +303,9 @@ class Scorer:
pred_per_feat[field] = set()
pred_per_feat[field].add((gold_i, feat))
for field in per_feat:
micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set()))
micro_score.score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
)
per_feat[field].score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
)

View File

@ -120,6 +120,11 @@ def es_tokenizer():
return get_lang_class("es")().tokenizer
@pytest.fixture(scope="session")
def es_vocab():
return get_lang_class("es")().vocab
@pytest.fixture(scope="session")
def eu_tokenizer():
return get_lang_class("eu")().tokenizer
@ -247,6 +252,11 @@ def pt_tokenizer():
return get_lang_class("pt")().tokenizer
@pytest.fixture(scope="session")
def pt_vocab():
return get_lang_class("pt")().vocab
@pytest.fixture(scope="session")
def ro_tokenizer():
return get_lang_class("ro")().tokenizer
@ -290,6 +300,11 @@ def ti_tokenizer():
return get_lang_class("ti")().tokenizer
@pytest.fixture(scope="session")
def tl_tokenizer():
return get_lang_class("tl")().tokenizer
@pytest.fixture(scope="session")
def tr_tokenizer():
return get_lang_class("tr")().tokenizer

View File

@ -119,6 +119,7 @@ def test_en_tokenizer_splits_period_abbr(en_tokenizer):
assert tokens[4].text == "Mr."
@pytest.mark.issue(225)
@pytest.mark.xfail(reason="Issue #225 - not yet implemented")
def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
tokens = en_tokenizer(

View File

@ -1,6 +1,156 @@
from spacy.tokens import Doc
import pytest
# fmt: off
@pytest.mark.parametrize(
"words,heads,deps,pos,chunk_offsets",
[
# un gato -> "un gato"
(
["un", "gato"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# la camisa negra -> "la camisa negra"
(
["la", "camisa", "negra"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0, 3)],
),
# un lindo gatito -> "un lindo gatito"
(
["Un", "lindo", "gatito"],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# una chica hermosa e inteligente -> una chica hermosa e inteligente
(
["Una", "chica", "hermosa", "e", "inteligente"],
[1, 1, 1, 4, 2],
["det", "ROOT", "amod", "cc", "conj"],
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
[(0,5)]
),
# el fabuloso gato pardo -> "el fabuloso gato pardo"
(
["el", "fabuloso", "gato", "pardo"],
[2, 2, 2, 2],
["det", "amod", "ROOT", "amod"],
["DET", "ADJ", "NOUN", "ADJ"],
[(0,4)]
),
# Tengo un gato y un perro -> un gato, un perro
(
["Tengo", "un", "gato", "y", "un", "perro"],
[0, 2, 0, 5, 5, 0],
["ROOT", "det", "obj", "cc", "det", "conj"],
["VERB", "DET", "NOUN", "CCONJ", "DET", "NOUN"],
[(1,3), (4,6)]
),
# Dom Pedro II -> Dom Pedro II
(
["Dom", "Pedro", "II"],
[0, 0, 0],
["ROOT", "flat", "flat"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# los Estados Unidos -> los Estados Unidos
(
["los", "Estados", "Unidos"],
[1, 1, 1],
["det", "ROOT", "flat"],
["DET", "PROPN", "PROPN"],
[(0,3)]
),
# Miguel de Cervantes -> Miguel de Cervantes
(
["Miguel", "de", "Cervantes"],
[0, 2, 0],
["ROOT", "case", "flat"],
["PROPN", "ADP", "PROPN"],
[(0,3)]
),
(
["Rio", "de", "Janeiro"],
[0, 2, 0],
["ROOT", "case", "flat"],
["PROPN", "ADP", "PROPN"],
[(0,3)]
),
# la destrucción de la ciudad -> la destrucción, la ciudad
(
["la", "destrucción", "de", "la", "ciudad"],
[1, 1, 4, 4, 1],
['det', 'ROOT', 'case', 'det', 'nmod'],
['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
[(0,2), (3,5)]
),
# la traducción de Susana del informe -> la traducción, Susana, informe
(
['la', 'traducción', 'de', 'Susana', 'del', 'informe'],
[1, 1, 3, 1, 5, 1],
['det', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
['DET', 'NOUN', 'ADP', 'PROPN', 'ADP', 'NOUN'],
[(0,2), (3,4), (5,6)]
),
# El gato regordete de Susana y su amigo -> el gato regordete, Susana, su amigo
(
['El', 'gato', 'regordete', 'de', 'Susana', 'y', 'su', 'amigo'],
[1, 1, 1, 4, 1, 7, 7, 1],
['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'conj'],
['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
[(0,3), (4,5), (6,8)]
),
# Afirmó que sigue el criterio europeo y que trata de incentivar el mercado donde no lo hay -> el criterio europeo, el mercado, donde, lo
(
['Afirmó', 'que', 'sigue', 'el', 'criterio', 'europeo', 'y', 'que', 'trata', 'de', 'incentivar', 'el', 'mercado', 'donde', 'no', 'lo', 'hay'],
[0, 2, 0, 4, 2, 4, 8, 8, 2, 10, 8, 12, 10, 16, 16, 16, 0],
['ROOT', 'mark', 'ccomp', 'det', 'obj', 'amod', 'cc', 'mark', 'conj', 'mark', 'xcomp', 'det', 'obj', 'obl', 'advmod', 'obj', 'advcl'],
['VERB', 'SCONJ', 'VERB', 'DET', 'NOUN', 'ADJ', 'CCONJ', 'SCONJ', 'VERB', 'ADP', 'VERB', 'DET', 'NOUN', 'PRON', 'ADV', 'PRON', 'AUX'],
[(3,6), (11,13), (13,14), (15,16)]
),
# En este sentido se refirió a la reciente creación del Ministerio de Ciencia y Tecnología y a las primeras declaraciones de su titular, Anna Birulés, sobre el impulso de la investigación, desarrollo e innovación -> este sentido, se, la reciente creación, Ministerio de Ciencia y Tecnología, a las primeras declaraciones, su titular, , Anna Birulés,, el impulso, la investigación, , desarrollo, innovación
(
['En', 'este', 'sentido', 'se', 'refirió', 'a', 'la', 'reciente', 'creación', 'del', 'Ministerio', 'de', 'Ciencia', 'y', 'Tecnología', 'y', 'a', 'las', 'primeras', 'declaraciones', 'de', 'su', 'titular', ',', 'Anna', 'Birulés', ',', 'sobre', 'el', 'impulso', 'de', 'la', 'investigación', ',', 'desarrollo', 'e', 'innovación'],
[2, 2, 4, 4, 4, 8, 8, 8, 4, 10, 8, 12, 10, 14, 12, 19, 19, 19, 19, 8, 22, 22, 19, 24, 22, 24, 24, 29, 29, 19, 32, 32, 29, 34, 32, 36, 32],
['case', 'det', 'obl', 'obj', 'ROOT', 'case', 'det', 'amod', 'obj', 'case', 'nmod', 'case', 'flat', 'cc', 'conj', 'cc', 'case', 'det', 'amod', 'conj', 'case', 'det', 'nmod', 'punct', 'appos', 'flat', 'punct', 'case', 'det', 'nmod', 'case', 'det', 'nmod', 'punct', 'conj', 'cc', 'conj'],
['ADP', 'DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'ADP', 'PROPN', 'CCONJ', 'PROPN', 'CCONJ', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'NOUN', 'CCONJ', 'NOUN'],
[(1, 3), (3, 4), (6, 9), (10, 15), (16, 20), (21, 23), (23, 27), (28, 30), (31, 33), (33, 35), (36, 37)]
),
# Asimismo defiende la financiación pública de la investigación básica y pone de manifiesto que las empresas se centran más en la investigación y desarrollo con objetivos de mercado. -> la financiación pública, la investigación básica, manifiesto, las empresas, se, la investigación, desarrollo, objetivos, mercado
(
['Asimismo', 'defiende', 'la', 'financiación', 'pública', 'de', 'la', 'investigación', 'básica', 'y', 'pone', 'de', 'manifiesto', 'que', 'las', 'empresas', 'se', 'centran', 'más', 'en', 'la', 'investigación', 'y', 'desarrollo', 'con', 'objetivos', 'de', 'mercado'],
[1, 1, 3, 1, 3, 7, 7, 3, 7, 10, 1, 12, 10, 17, 15, 17, 17, 10, 17, 21, 21, 17, 23, 21, 25, 17, 27, 25],
['advmod', 'ROOT', 'det', 'obj', 'amod', 'case', 'det', 'nmod', 'amod', 'cc', 'conj', 'case', 'obl', 'mark', 'det', 'nsubj', 'obj', 'ccomp', 'obj', 'case', 'det', 'obl', 'cc', 'conj', 'case', 'obl', 'case', 'nmod'],
['ADV', 'VERB', 'DET', 'NOUN', 'ADJ', 'ADP', 'DET', 'NOUN', 'ADJ', 'CCONJ', 'VERB', 'ADP', 'NOUN', 'SCONJ', 'DET', 'NOUN', 'PRON', 'VERB', 'ADV', 'ADP', 'DET', 'NOUN', 'CCONJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(2, 5), (6, 9), (12, 13), (14, 16), (16, 17), (20, 22), (23, 24), (25, 26), (27, 28)]
),
# Tras indicar que la inversión media en investigación en la Unión Europea se sitúa en el 1,8 por ciento del PIB, frente al 2,8 por ciento en Japón y EEUU, Couceiro dijo que España está en "el buen camino" y se está creando un entorno propicio para la innovación empresarial' -> la inversión media, investigación, la Unión Europea, se, PIB, Japón, EEUU, Couceiro, España, se, un entorno propicio para la innovación empresaria
(
['Tras', 'indicar', 'que', 'la', 'inversión', 'media', 'en', 'investigación', 'en', 'la', 'Unión', 'Europea', 'se', 'sitúa', 'en', 'el', '1,8', 'por', 'ciento', 'del', 'PIB', ',', 'frente', 'al', '2,8', 'por', 'ciento', 'en', 'Japón', 'y', 'EEUU', ',', 'Couceiro', 'dijo', 'que', 'España', 'está', 'en', '"', 'el', 'buen', 'camino', '"', 'y', 'se', 'está', 'creando', 'un', 'entorno', 'propicio', 'para', 'la', 'innovación', 'empresarial'],
[1, 33, 13, 4, 13, 4, 7, 4, 10, 10, 4, 10, 13, 1, 16, 16, 13, 18, 16, 20, 16, 24, 24, 22, 13, 26, 24, 28, 24, 30, 28, 1, 33, 33, 41, 41, 41, 41, 41, 41, 41, 33, 41, 46, 46, 46, 33, 48, 46, 48, 52, 52, 49, 52],
['mark', 'advcl', 'mark', 'det', 'nsubj', 'amod', 'case', 'nmod', 'case', 'det', 'nmod', 'flat', 'obj', 'ccomp', 'case', 'det', 'obj', 'case', 'compound', 'case', 'nmod', 'punct', 'case', 'fixed', 'obl', 'case', 'compound', 'case', 'nmod', 'cc', 'conj', 'punct', 'nsubj', 'ROOT', 'mark', 'nsubj', 'cop', 'case', 'punct', 'det', 'amod', 'ccomp', 'punct', 'cc', 'obj', 'aux', 'conj', 'det', 'nsubj', 'amod', 'case', 'det', 'nmod', 'amod'],
['ADP', 'VERB', 'SCONJ', 'DET', 'NOUN', 'ADJ', 'ADP', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'PRON', 'VERB', 'ADP', 'DET', 'NUM', 'ADP', 'NUM', 'ADP', 'PROPN', 'PUNCT', 'NOUN', 'ADP', 'NUM', 'ADP', 'NUM', 'ADP', 'PROPN', 'CCONJ', 'PROPN', 'PUNCT', 'PROPN', 'VERB', 'SCONJ', 'PROPN', 'AUX', 'ADP', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'CCONJ', 'PRON', 'AUX', 'VERB', 'DET', 'NOUN', 'ADJ', 'ADP', 'DET', 'NOUN', 'ADJ'],
[(3, 6), (7, 8), (9, 12), (12, 13), (20, 21), (28, 29), (30, 31), (32, 33), (35, 36), (44, 45), (47, 54)]
),
],
)
# fmt: on
def test_es_noun_chunks(es_vocab, words, heads, deps, pos, chunk_offsets):
doc = Doc(es_vocab, words=words, heads=heads, deps=deps, pos=pos)
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
def test_noun_chunks_is_parsed_es(es_tokenizer):
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed."""
doc = es_tokenizer("en Oxford este verano")

View File

@ -4,6 +4,7 @@ from spacy.lang.punctuation import TOKENIZER_INFIXES
from spacy.lang.char_classes import ALPHA
@pytest.mark.issue(768)
@pytest.mark.parametrize(
"text,expected_tokens", [("l'avion", ["l'", "avion"]), ("j'ai", ["j'", "ai"])]
)

View File

@ -133,11 +133,7 @@ def test_ja_tokenizer_sub_tokens(
(["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
(["トッ"], [""], ["ツケ"], [""]),
),
(
"2=3",
([], [], []),
([""], ["_"], ["サン"])
),
("2=3", ([], [], []), ([""], ["_"], ["サン"])),
],
)
def test_ja_tokenizer_inflections_reading_forms(

View File

@ -0,0 +1,221 @@
from spacy.tokens import Doc
import pytest
# fmt: off
@pytest.mark.parametrize(
"words,heads,deps,pos,chunk_offsets",
[
# determiner + noun
# um cachorro -> um cachorro
(
["um", "cachorro"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# two determiners + noun
# meu o pai -> meu o pai
(
["meu", "o", "pai"],
[2, 2, 2],
["det", "det", "ROOT"],
["DET", "DET", "NOUN"],
[(0, 3)],
),
# two determiners + noun
# todos essos caros -> todos essos caros
(
["todos", "essos", "caros"],
[2, 2, 2],
["det", "det", "ROOT"],
["DET", "DET", "NOUN"],
[(0, 3)],
),
# two determiners, one is after noun
# um irmão meu -> um irmão meu
(
["um", "irmão", "meu"],
[1, 1, 1],
["det", "ROOT", "det"],
["DET", "NOUN", "DET"],
[(0, 3)],
),
# two determiners + noun
# o meu pai -> o meu pai
(
["o", "meu", "pai"],
[2, 2, 2],
["det","det", "ROOT"],
["DET", "DET", "NOUN"],
[(0, 3)],
),
# relative pronoun
# A bicicleta essa está estragada -> A bicicleta
(
['A', 'bicicleta', 'essa', 'está', 'estragada'],
[1, 4, 1, 4, 4],
['det', 'nsubj', 'det', 'cop', 'ROOT'],
['DET', 'NOUN', 'PRON', 'AUX', 'ADJ'],
[(0,2)]
),
# relative subclause
# o computador que comprou -> o computador
(
['o', 'computador', 'que', 'comprou'],
[1, 1, 3, 1],
['det', 'ROOT', 'nsubj', 'acl:relcl'],
['DET', 'NOUN', 'PRON', 'VERB'],
[(0, 2), (2, 3)]
),
# det + noun + adj
# O cachorro marrom -> O cachorro marrom
(
["O", "cachorro", "marrom"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0, 3)],
),
# det + noun + adj plural
# As calças baratas -> As calças baratas
(
["As", "calças", "baratas"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0, 3)],
),
# det + adj + noun
# Uma boa ideia -> Uma boa ideia
(
['uma', 'boa', 'ideia'],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# multiple adjectives
# Uma garota esperta e inteligente -> Uma garota esperta e inteligente
(
["Uma", "garota", "esperta", "e", "inteligente"],
[1, 1, 1, 4, 2],
["det", "ROOT", "amod", "cc", "conj"],
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
[(0,5)]
),
# determiner, adjective, compound created by flat
# a grande São Paolo -> a grande São Paolo
(
["a", "grande", "São", "Paolo"],
[2, 2, 2, 2],
["det", "amod", "ROOT", "flat:name"],
["DET", "ADJ", "PROPN", "PROPN"],
[(0,4)]
),
# one determiner + one noun + one adjective qualified by an adverb
# alguns fazendeiros muito ricos -> alguns fazendeiros muito ricos
(
['alguns', 'fazendeiros', 'muito', 'ricos'],
[1, 1, 3, 1],
['det', 'ROOT', 'advmod', 'amod'],
['DET', 'NOUN', 'ADV', 'ADJ'],
[(0,4)]
),
# Two NPs conjuncted
# Eu tenho um cachorro e um gato -> Eu, um cachorro, um gato
(
["Eu", "tenho", "um", "cachorro", "e", "um", "gato"],
[1, 1, 3, 1, 6, 6, 3],
['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
[(0,1), (2,4), (5,7)]
),
# Two NPs together
# o escritor brasileiro Aníbal Machado -> o escritor brasileiro, Aníbal Machado
(
['o', 'escritor', 'brasileiro', 'Aníbal', 'Machado'],
[1, 1, 1, 1, 3],
['det', 'ROOT', 'amod', 'appos', 'flat:name'],
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
[(0, 3), (3, 5)]
),
# Noun compound, person name and titles
# Dom Pedro II -> Dom Pedro II
(
["Dom", "Pedro", "II"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# Noun compound created by flat
# os Estados Unidos -> os Estados Unidos
(
["os", "Estados", "Unidos"],
[1, 1, 1],
["det", "ROOT", "flat:name"],
["DET", "PROPN", "PROPN"],
[(0,3)]
),
# nmod relation between NPs
# a destruição da cidade -> a destruição, cidade
(
['a', 'destruição', 'da', 'cidade'],
[1, 1, 3, 1],
['det', 'ROOT', 'case', 'nmod'],
['DET', 'NOUN', 'ADP', 'NOUN'],
[(0,2), (3,4)]
),
# Compounding by nmod, several NPs chained together
# a primeira fábrica de medicamentos do governo -> a primeira fábrica, medicamentos, governo
(
["a", "primeira", "fábrica", "de", "medicamentos", "do", "governo"],
[2, 2, 2, 4, 2, 6, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(0, 3), (4, 5), (6, 7)]
),
# several NPs
# Tradução da reportagem de Susana -> Tradução, reportagem, Susana
(
['Tradução', 'da', 'reportagem', 'de', 'Susana'],
[0, 2, 0, 4, 2],
['ROOT', 'case', 'nmod', 'case', 'nmod'],
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3), (4,5)]
),
# Several NPs
# O gato gordo da Susana e seu amigo -> O gato gordo, Susana, seu amigo
(
['O', 'gato', 'gordo', 'da', 'Susana', 'e', 'seu', 'amigo'],
[1, 1, 1, 4, 1, 7, 7, 1],
['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'conj'],
['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
[(0,3), (4,5), (6,8)]
),
# Passive subject
# Os novos gastos são alimentados pela grande conta bancária de Clinton -> Os novos gastos, grande conta bancária, Clinton
(
['Os', 'novos', 'gastos', 'são', 'alimentados', 'pela', 'grande', 'conta', 'bancária', 'de', 'Clinton'],
[2, 2, 4, 4, 4, 7, 7, 4, 7, 10, 7],
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
[(0, 3), (6, 9), (10, 11)]
)
],
)
# fmt: on
def test_pt_noun_chunks(pt_vocab, words, heads, deps, pos, chunk_offsets):
doc = Doc(pt_vocab, words=words, heads=heads, deps=deps, pos=pos)
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
def test_noun_chunks_is_parsed_pt(pt_tokenizer):
"""Test that noun_chunks raises Value Error for 'pt' language if Doc is not parsed."""
doc = pt_tokenizer("en Oxford este verano")
with pytest.raises(ValueError):
list(doc.noun_chunks)

View File


@ -0,0 +1,8 @@
def test_tl_simple_punct(tl_tokenizer):
text = "Sige, punta ka dito"
tokens = tl_tokenizer(text)
assert tokens[0].idx == 0
assert tokens[1].idx == 4
assert tokens[2].idx == 6
assert tokens[3].idx == 12
assert tokens[4].idx == 15

View File

@ -0,0 +1,127 @@
import pytest
from spacy.util import compile_prefix_regex
from spacy.lang.punctuation import TOKENIZER_PREFIXES
PUNCT_OPEN = ["(", "[", "{", "*"]
PUNCT_CLOSE = [")", "]", "}", "*"]
PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]
@pytest.mark.parametrize("text", ["(", "((", "<"])
def test_tl_tokenizer_handles_only_punct(tl_tokenizer, text):
tokens = tl_tokenizer(text)
assert len(tokens) == len(text)
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_split_open_punct(tl_tokenizer, punct, text):
tokens = tl_tokenizer(punct + text)
assert len(tokens) == 2
assert tokens[0].text == punct
assert tokens[1].text == text
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_close_punct(tl_tokenizer, punct, text):
tokens = tl_tokenizer(text + punct)
assert len(tokens) == 2
assert tokens[0].text == text
assert tokens[1].text == punct
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_two_diff_open_punct(tl_tokenizer, punct, punct_add, text):
tokens = tl_tokenizer(punct + punct_add + text)
assert len(tokens) == 3
assert tokens[0].text == punct
assert tokens[1].text == punct_add
assert tokens[2].text == text
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_two_diff_close_punct(tl_tokenizer, punct, punct_add, text):
tokens = tl_tokenizer(text + punct + punct_add)
assert len(tokens) == 3
assert tokens[0].text == text
assert tokens[1].text == punct
assert tokens[2].text == punct_add
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_same_open_punct(tl_tokenizer, punct, text):
tokens = tl_tokenizer(punct + punct + punct + text)
assert len(tokens) == 4
assert tokens[0].text == punct
assert tokens[3].text == text
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_same_close_punct(tl_tokenizer, punct, text):
tokens = tl_tokenizer(text + punct + punct + punct)
assert len(tokens) == 4
assert tokens[0].text == text
assert tokens[1].text == punct
@pytest.mark.parametrize("text", ["'Ang"])
def test_tl_tokenizer_splits_open_apostrophe(tl_tokenizer, text):
tokens = tl_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == "'"
@pytest.mark.parametrize("text", ["Mabuhay''"])
def test_tl_tokenizer_splits_double_end_quote(tl_tokenizer, text):
tokens = tl_tokenizer(text)
assert len(tokens) == 2
tokens_punct = tl_tokenizer("''")
assert len(tokens_punct) == 1
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_open_close_punct(
tl_tokenizer, punct_open, punct_close, text
):
tokens = tl_tokenizer(punct_open + text + punct_close)
assert len(tokens) == 3
assert tokens[0].text == punct_open
assert tokens[1].text == text
assert tokens[2].text == punct_close
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_two_diff_punct(
tl_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
):
tokens = tl_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
assert len(tokens) == 5
assert tokens[0].text == punct_open2
assert tokens[1].text == punct_open
assert tokens[2].text == text
assert tokens[3].text == punct_close
assert tokens[4].text == punct_close2
@pytest.mark.parametrize("text,punct", [("(sa'yo", "(")])
def test_tl_tokenizer_splits_pre_punct_regex(text, punct):
tl_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
match = tl_search_prefixes(text)
assert match.group() == punct
def test_tl_tokenizer_splits_bracket_period(tl_tokenizer):
text = "(Dumating siya kahapon)."
tokens = tl_tokenizer(text)
assert tokens[len(tokens) - 1].text == "."

View File

@ -0,0 +1,73 @@
import pytest
from spacy.lang.tl.lex_attrs import like_num
# https://github.com/explosion/spaCy/blob/master/spacy/tests/lang/en/test_text.py
def test_tl_tokenizer_handles_long_text(tl_tokenizer):
# Excerpt: "Sapagkat ang Pilosopiya ay Ginagawa" by Padre Roque Ferriols
text = """
Tingin tayo nang tingin. Kailangan lamang nating dumilat at
marami tayong makikita. At ang pagtingin ay isang gawain na ako lamang ang
makagagawa, kung ako nga ang makakita. Kahit na napanood na ng aking
matalik na kaibigan ang isang sine, kailangan ko pa ring panoorin, kung
ako nga ang may gustong makakita. Kahit na gaano kadikit ang aming
pagkabuklod, hindi siya maaaring tumingin sa isang paraan na ako ang
nakakakita. Kung ako ang makakita, ako lamang ang makatitingin.
"""
tokens = tl_tokenizer(text)
assert len(tokens) == 97
@pytest.mark.parametrize(
"text,length",
[
("Huwag mo nang itanong sa akin.", 7),
("Nasubukan mo na bang hulihin ang hangin?", 8),
("Hindi ba?", 3),
("Nagbukas ang DFA ng 1,000 appointment slots para sa pasaporte.", 11),
("'Wala raw pasok bukas kasi may bagyo!' sabi ni Micah.", 14),
("'Ingat,' aniya. 'Maingay sila pag malayo at tahimik kung malapit.'", 17),
],
)
def test_tl_tokenizer_handles_cnts(tl_tokenizer, text, length):
tokens = tl_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize(
"text,match",
[
("10", True),
("isa", True),
("dalawa", True),
("tatlumpu", True),
pytest.param(
"isang daan",
True,
marks=pytest.mark.xfail(reason="Not yet implemented (means 100)"),
),
pytest.param(
"kalahati",
True,
marks=pytest.mark.xfail(reason="Not yet implemented (means 1/2)"),
),
pytest.param(
"isa't kalahati",
True,
marks=pytest.mark.xfail(
reason="Not yet implemented (means one-and-a-half)"
),
),
],
)
def test_lex_attrs_like_number(tl_tokenizer, text, match):
tokens = tl_tokenizer(text)
assert all([token.like_num for token in tokens]) == match
@pytest.mark.xfail(reason="Not yet implemented, fails when capitalized.")
@pytest.mark.parametrize("word", ["isa", "dalawa", "tatlo"])
def test_tl_lex_attrs_capitals(word):
assert like_num(word)
assert like_num(word.upper())

View File

@ -370,6 +370,7 @@ def test_dependency_matcher_span_user_data(en_tokenizer):
assert doc_t_i == span_t_i + offset
@pytest.mark.issue(9263)
def test_dependency_matcher_order_issue(en_tokenizer):
# issue from #9263
doc = en_tokenizer("I like text")
@ -415,6 +416,7 @@ def test_dependency_matcher_order_issue(en_tokenizer):
assert matches == []
@pytest.mark.issue(9263)
def test_dependency_matcher_remove(en_tokenizer):
# issue from #9263
doc = en_tokenizer("The red book")

View File

@ -152,6 +152,7 @@ def test_operator_combos(en_vocab):
assert not matches, (string, pattern_str)
@pytest.mark.issue(1450)
def test_matcher_end_zero_plus(en_vocab):
"""Test matcher works when patterns end with * operator. (issue 1450)"""
matcher = Matcher(en_vocab)

View File

@ -12,6 +12,7 @@ from spacy.tokens import Doc, Span
from ..util import make_tempdir
@pytest.mark.issue(118)
@pytest.mark.parametrize(
"patterns",
[
@ -39,6 +40,7 @@ def test_issue118(en_tokenizer, patterns):
assert ents[0].end == 11
@pytest.mark.issue(118)
@pytest.mark.parametrize(
"patterns",
[
@ -66,6 +68,7 @@ def test_issue118_prefix_reorder(en_tokenizer, patterns):
assert ents[0].end == 11
@pytest.mark.issue(242)
def test_issue242(en_tokenizer):
"""Test overlapping multi-word phrases."""
text = "There are different food safety standards in different countries."
@ -88,6 +91,7 @@ def test_issue242(en_tokenizer):
doc.ents += tuple(matches)
@pytest.mark.issue(309)
def test_issue309(en_vocab):
"""Test Issue #309: SBD fails on empty string"""
doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"])
@ -96,6 +100,7 @@ def test_issue309(en_vocab):
assert len(sents) == 1
@pytest.mark.issue(351)
def test_issue351(en_tokenizer):
doc = en_tokenizer(" This is a cat.")
assert doc[0].idx == 0
@ -103,12 +108,14 @@ def test_issue351(en_tokenizer):
assert doc[1].idx == 3
@pytest.mark.issue(360)
def test_issue360(en_tokenizer):
"""Test tokenization of big ellipsis"""
tokens = en_tokenizer("$45...............Asking")
assert len(tokens) > 2
@pytest.mark.issue(361)
@pytest.mark.parametrize("text1,text2", [("cat", "dog")])
def test_issue361(en_vocab, text1, text2):
"""Test Issue #361: Equality of lexemes"""
@ -116,6 +123,7 @@ def test_issue361(en_vocab, text1, text2):
assert en_vocab[text1] != en_vocab[text2]
@pytest.mark.issue(587)
def test_issue587(en_tokenizer):
"""Test that Matcher doesn't segfault on particular input"""
doc = en_tokenizer("a b; c")
@ -131,12 +139,14 @@ def test_issue587(en_tokenizer):
assert len(matches) == 2
@pytest.mark.issue(588)
def test_issue588(en_vocab):
matcher = Matcher(en_vocab)
with pytest.raises(ValueError):
matcher.add("TEST", [[]])
@pytest.mark.issue(590)
def test_issue590(en_vocab):
"""Test overlapping matches"""
doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
@ -149,6 +159,7 @@ def test_issue590(en_vocab):
assert len(matches) == 2
@pytest.mark.issue(595)
@pytest.mark.skip(reason="Old vocab-based lemmatization")
def test_issue595():
"""Test lemmatization of base forms"""
@ -164,6 +175,7 @@ def test_issue595():
assert doc[2].lemma_ == "feed"
@pytest.mark.issue(599)
def test_issue599(en_vocab):
doc = Doc(en_vocab)
doc2 = Doc(doc.vocab)
@ -171,12 +183,14 @@ def test_issue599(en_vocab):
assert doc2.has_annotation("DEP")
@pytest.mark.issue(600)
def test_issue600():
vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
doc = Doc(vocab, words=["hello"])
doc[0].tag_ = "NN"
@pytest.mark.issue(615)
def test_issue615(en_tokenizer):
def merge_phrases(matcher, doc, i, matches):
"""Merge a phrase. We have to be careful here because we'll change the
@ -204,6 +218,7 @@ def test_issue615(en_tokenizer):
assert entities[0].label != 0
@pytest.mark.issue(736)
@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")])
def test_issue736(en_tokenizer, text, number):
"""Test that times like "7am" are tokenized correctly and that numbers are
@ -213,6 +228,7 @@ def test_issue736(en_tokenizer, text, number):
assert tokens[0].text == number
@pytest.mark.issue(740)
@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"])
def test_issue740(en_tokenizer, text):
"""Test that dates are not split and kept as one token. This behaviour is
@ -222,6 +238,7 @@ def test_issue740(en_tokenizer, text):
assert len(tokens) == 1
@pytest.mark.issue(743)
def test_issue743():
doc = Doc(Vocab(), ["hello", "world"])
token = doc[0]
@ -230,6 +247,7 @@ def test_issue743():
assert items[0] is token
@pytest.mark.issue(744)
@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"])
def test_issue744(en_tokenizer, text):
"""Test that 'were' and 'Were' are excluded from the contractions
@ -239,6 +257,7 @@ def test_issue744(en_tokenizer, text):
assert tokens[1].text.lower() == "were"
@pytest.mark.issue(759)
@pytest.mark.parametrize(
"text,is_num", [("one", True), ("ten", True), ("teneleven", False)]
)
@ -247,6 +266,7 @@ def test_issue759(en_tokenizer, text, is_num):
assert tokens[0].like_num == is_num
@pytest.mark.issue(775)
@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"])
def test_issue775(en_tokenizer, text):
"""Test that 'Shell' and 'shell' are excluded from the contractions
@ -256,6 +276,7 @@ def test_issue775(en_tokenizer, text):
assert tokens[0].text == text
@pytest.mark.issue(792)
@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text):
"""Test for Issue #792: Trailing whitespace is removed after tokenization."""
@ -263,6 +284,7 @@ def test_issue792(en_tokenizer, text):
assert "".join([token.text_with_ws for token in doc]) == text
@pytest.mark.issue(792)
@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"])
def test_control_issue792(en_tokenizer, text):
"""Test base case for Issue #792: Non-trailing whitespace"""
@ -270,6 +292,7 @@ def test_control_issue792(en_tokenizer, text):
assert "".join([token.text_with_ws for token in doc]) == text
@pytest.mark.issue(801)
@pytest.mark.skip(
reason="Can not be fixed unless with variable-width lookbehinds, cf. PR #3218"
)
@ -292,6 +315,7 @@ def test_issue801(en_tokenizer, text, tokens):
assert [t.text for t in doc] == tokens
@pytest.mark.issue(805)
@pytest.mark.parametrize(
"text,expected_tokens",
[
@ -311,6 +335,7 @@ def test_issue805(sv_tokenizer, text, expected_tokens):
assert expected_tokens == token_list
@pytest.mark.issue(850)
def test_issue850():
"""The variable-length pattern matches the succeeding token. Check we
handle the ambiguity correctly."""
@ -326,6 +351,7 @@ def test_issue850():
assert end == 4
@pytest.mark.issue(850)
def test_issue850_basic():
"""Test Matcher matches with '*' operator and Boolean flag"""
vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
@ -340,6 +366,7 @@ def test_issue850_basic():
assert end == 4
@pytest.mark.issue(852)
@pytest.mark.skip(
reason="French exception list is not enabled in the default tokenizer anymore"
)
@ -352,6 +379,7 @@ def test_issue852(fr_tokenizer, text):
assert len(tokens) == 1
@pytest.mark.issue(859)
@pytest.mark.parametrize(
"text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"]
)
@ -361,6 +389,7 @@ def test_issue859(en_tokenizer, text):
assert doc.text == text
@pytest.mark.issue(886)
@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"])
def test_issue886(en_tokenizer, text):
"""Test that token.idx matches the original text index for texts with newlines."""
@ -370,6 +399,7 @@ def test_issue886(en_tokenizer, text):
assert text[token.idx] == token.text[0]
@pytest.mark.issue(891)
@pytest.mark.parametrize("text", ["want/need"])
def test_issue891(en_tokenizer, text):
"""Test that / infixes are split correctly."""
@ -378,6 +408,7 @@ def test_issue891(en_tokenizer, text):
assert tokens[1].text == "/"
@pytest.mark.issue(912)
@pytest.mark.skip(reason="Old vocab-based lemmatization")
@pytest.mark.parametrize(
"text,tag,lemma",
@ -390,6 +421,7 @@ def test_issue912(en_vocab, text, tag, lemma):
assert doc[0].lemma_ == lemma
@pytest.mark.issue(957)
@pytest.mark.slow
def test_issue957(en_tokenizer):
"""Test that spaCy doesn't hang on many punctuation characters.
@ -405,6 +437,7 @@ def test_issue957(en_tokenizer):
assert doc
@pytest.mark.issue(999)
def test_issue999():
"""Test that adding entities and resuming training works passably OK.
There are two issues here:

View File

@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
from spacy.symbols import ORTH, LEMMA, POS
@pytest.mark.issue(1061)
def test_issue1061():
"""Test special-case works after tokenizing. Was caching problem."""
text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."
@ -33,6 +34,7 @@ def test_issue1061():
@pytest.mark.skip(
reason="Can not be fixed without variable-width look-behind (which we don't want)"
)
@pytest.mark.issue(1235)
def test_issue1235():
"""Test that g is not split of if preceded by a number and a letter"""
nlp = English()
@ -46,6 +48,7 @@ def test_issue1235():
assert doc[4].text == "g"
@pytest.mark.issue(1242)
def test_issue1242():
nlp = English()
doc = nlp("")
@ -56,6 +59,7 @@ def test_issue1242():
@pytest.mark.skip(reason="v3 no longer supports LEMMA/POS in tokenizer special cases")
@pytest.mark.issue(1250)
def test_issue1250():
"""Test cached special cases."""
special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}]
@ -67,6 +71,7 @@ def test_issue1250():
assert lemmas == ["reimburse", ",", "reimburse", "..."]
@pytest.mark.issue(1257)
def test_issue1257():
"""Test that tokens compare correctly."""
doc1 = Doc(Vocab(), words=["a", "b", "c"])
@ -75,6 +80,7 @@ def test_issue1257():
assert not doc1[0] == doc2[0]
@pytest.mark.issue(1375)
def test_issue1375():
"""Test that token.nbor() raises IndexError for out-of-bounds access."""
doc = Doc(Vocab(), words=["0", "1", "2"])
@ -86,6 +92,7 @@ def test_issue1375():
assert doc[1].nbor(1).text == "2"
@pytest.mark.issue(1434)
def test_issue1434():
"""Test matches occur when optional element at end of short doc."""
pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]
@ -111,6 +118,7 @@ def test_issue1434():
("a b b", 0, 3),
],
)
@pytest.mark.issue(1450)
def test_issue1450(string, start, end):
"""Test matcher works when patterns end with * operator."""
pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
@ -124,6 +132,7 @@ def test_issue1450(string, start, end):
assert matches[-1][2] == end
@pytest.mark.issue(1488)
def test_issue1488():
prefix_re = re.compile(r"""[\[\("']""")
suffix_re = re.compile(r"""[\]\)"']""")
@ -147,6 +156,7 @@ def test_issue1488():
assert token.text
@pytest.mark.issue(1494)
def test_issue1494():
infix_re = re.compile(r"""[^a-z]""")
test_cases = [

View File

@ -17,6 +17,7 @@ from spacy.matcher import Matcher
from ..util import make_tempdir
@pytest.mark.issue(1506)
def test_issue1506():
def string_generator():
for _ in range(10001):
@ -40,6 +41,7 @@ def test_issue1506():
str(t.lemma_)
@pytest.mark.issue(1518)
def test_issue1518():
"""Test vectors.resize() works."""
vectors = Vectors(shape=(10, 10))
@ -47,6 +49,7 @@ def test_issue1518():
vectors.resize((5, 9))
@pytest.mark.issue(1537)
def test_issue1537():
"""Test that Span.as_doc() doesn't segfault."""
string = "The sky is blue . The man is pink . The dog is purple ."
@ -65,6 +68,7 @@ def test_issue1537():
# TODO: Currently segfaulting, due to l_edge and r_edge misalignment
@pytest.mark.issue(1537)
# def test_issue1537_model():
# nlp = load_spacy('en')
# doc = nlp('The sky is blue. The man is pink. The dog is purple.')
@ -73,12 +77,14 @@ def test_issue1537():
# print(list(sents[1].noun_chunks))
@pytest.mark.issue(1539)
def test_issue1539():
"""Ensure vectors.resize() doesn't try to modify dictionary during iteration."""
v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100])
v.resize((100, 100))
@pytest.mark.issue(1547)
def test_issue1547():
"""Test that entity labels still match after merging tokens."""
words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
@ -89,12 +95,14 @@ def test_issue1547():
assert [ent.text for ent in doc.ents]
@pytest.mark.issue(1612)
def test_issue1612(en_tokenizer):
doc = en_tokenizer("The black cat purrs.")
span = doc[1:3]
assert span.orth_ == span.text
@pytest.mark.issue(1654)
def test_issue1654():
nlp = Language(Vocab())
assert not nlp.pipeline
@ -116,12 +124,14 @@ def test_issue1654():
@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"])
@pytest.mark.issue(1698)
def test_issue1698(en_tokenizer, text):
doc = en_tokenizer(text)
assert len(doc) == 1
assert not doc[0].like_url
@pytest.mark.issue(1727)
def test_issue1727():
"""Test that models with no pretrained vectors can be deserialized
correctly after vectors are added."""
@ -138,6 +148,7 @@ def test_issue1727():
assert tagger.cfg.get("pretrained_dims", 0) == 0
@pytest.mark.issue(1757)
def test_issue1757():
"""Test comparison against None doesn't cause segfault."""
doc = Doc(Vocab(), words=["a", "b", "c"])
@ -151,12 +162,14 @@ def test_issue1757():
assert not doc.vocab["a"] < None
@pytest.mark.issue(1758)
def test_issue1758(en_tokenizer):
"""Test that "would've" is handled by the English tokenizer exceptions."""
tokens = en_tokenizer("would've")
assert len(tokens) == 2
@pytest.mark.issue(1773)
def test_issue1773(en_tokenizer):
"""Test that spaces don't receive a POS but no TAG. This is the root cause
of the serialization issue reported in #1773."""
@ -165,6 +178,7 @@ def test_issue1773(en_tokenizer):
assert doc[0].tag_ != ""
@pytest.mark.issue(1799)
def test_issue1799():
"""Test sentence boundaries are deserialized correctly, even for
non-projective sentences."""
@ -186,6 +200,7 @@ def test_issue1799():
assert len(list(doc.sents)) == 1
@pytest.mark.issue(1807)
def test_issue1807():
"""Test vocab.set_vector also adds the word to the vocab."""
vocab = Vocab(vectors_name="test_issue1807")
@ -194,6 +209,7 @@ def test_issue1807():
assert "hello" in vocab
@pytest.mark.issue(1834)
def test_issue1834():
"""Test that sentence boundaries & parse/tag flags are not lost
during serialization."""
@ -217,6 +233,7 @@ def test_issue1834():
assert new_doc.has_annotation("TAG")
@pytest.mark.issue(1868)
def test_issue1868():
"""Test Vocab.__contains__ works with int keys."""
vocab = Vocab()
@ -228,6 +245,7 @@ def test_issue1868():
assert int_id not in vocab
@pytest.mark.issue(1883)
def test_issue1883():
matcher = Matcher(Vocab())
matcher.add("pat1", [[{"orth": "hello"}]])
@ -239,11 +257,13 @@ def test_issue1883():
@pytest.mark.parametrize("word", ["the"])
@pytest.mark.issue(1889)
def test_issue1889(word):
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
@pytest.mark.skip(reason="obsolete with the config refactor of v.3")
@pytest.mark.issue(1915)
def test_issue1915():
cfg = {"hidden_depth": 2} # should error out
nlp = Language()
@ -253,6 +273,7 @@ def test_issue1915():
nlp.initialize(**cfg)
@pytest.mark.issue(1945)
def test_issue1945():
"""Test regression in Matcher introduced in v2.0.6."""
matcher = Matcher(Vocab())
@ -264,6 +285,7 @@ def test_issue1945():
assert matches[1][1:] == (1, 3)
@pytest.mark.issue(1963)
def test_issue1963(en_tokenizer):
"""Test that doc.merge() resizes doc.tensor"""
doc = en_tokenizer("a b c d")
@ -275,6 +297,7 @@ def test_issue1963(en_tokenizer):
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
@pytest.mark.issue(1967)
def test_issue1967(label):
nlp = Language()
config = {}
@ -293,6 +316,7 @@ def test_issue1967(label):
assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1]
@pytest.mark.issue(1971)
def test_issue1971(en_vocab):
# Possibly related to #2675 and #2671?
matcher = Matcher(en_vocab)

View File

@ -13,6 +13,7 @@ from ..util import add_vecs_to_vocab
@pytest.mark.skip(
reason="Can not be fixed without iterative looping between prefix/suffix and infix"
)
@pytest.mark.issue(2070)
def test_issue2070():
"""Test that checks that a dot followed by a quote is handled
appropriately.
@ -25,6 +26,7 @@ def test_issue2070():
assert len(doc) == 11
@pytest.mark.issue(2179)
def test_issue2179():
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
nlp = Italian()
@ -41,6 +43,7 @@ def test_issue2179():
assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
@pytest.mark.issue(2203)
def test_issue2203(en_vocab):
"""Test that lemmas are set correctly in doc.from_array."""
words = ["I", "'ll", "survive"]
@ -61,6 +64,7 @@ def test_issue2203(en_vocab):
assert [t.lemma_ for t in new_doc] == lemmas
@pytest.mark.issue(2219)
def test_issue2219(en_vocab):
vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])]
add_vecs_to_vocab(en_vocab, vectors)
@ -69,6 +73,7 @@ def test_issue2219(en_vocab):
assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
@pytest.mark.issue(2361)
def test_issue2361(de_vocab):
chars = ("&lt;", "&gt;", "&amp;", "&quot;")
words = ["<", ">", "&", '"']
@ -78,6 +83,7 @@ def test_issue2361(de_vocab):
assert char in html
@pytest.mark.issue(2385)
def test_issue2385():
"""Test that IOB tags are correctly converted to BILUO tags."""
# fix bug in labels with a 'b' character
@ -99,11 +105,13 @@ def test_issue2385():
("U-BRAWLER", "U-BRAWLER"),
],
)
@pytest.mark.issue(2385)
def test_issue2385_biluo(tags):
"""Test that BILUO-compatible tags aren't modified."""
assert iob_to_biluo(tags) == list(tags)
@pytest.mark.issue(2396)
def test_issue2396(en_vocab):
words = ["She", "created", "a", "test", "for", "spacy"]
heads = [1, 1, 3, 1, 3, 4]
@ -125,6 +133,7 @@ def test_issue2396(en_vocab):
assert (span.get_lca_matrix() == matrix).all()
@pytest.mark.issue(2464)
def test_issue2464(en_vocab):
"""Test problem with successive ?. This is the same bug, so putting it here."""
matcher = Matcher(en_vocab)
@ -134,6 +143,7 @@ def test_issue2464(en_vocab):
assert len(matches) == 3
@pytest.mark.issue(2482)
def test_issue2482():
"""Test we can serialize and deserialize a blank NER or parser model."""
nlp = Italian()

View File

@ -13,6 +13,7 @@ import numpy
import random
@pytest.mark.issue(2564)
def test_issue2564():
"""Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""
nlp = Language()
@ -26,6 +27,7 @@ def test_issue2564():
assert piped_doc.has_annotation("TAG")
@pytest.mark.issue(2569)
def test_issue2569(en_tokenizer):
"""Test that operator + is greedy."""
doc = en_tokenizer("It is May 15, 1993.")
@ -46,12 +48,14 @@ def test_issue2569(en_tokenizer):
"oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:",
],
)
@pytest.mark.issue(2626)
def test_issue2626_2835(en_tokenizer, text):
"""Check that sentence doesn't cause an infinite loop in the tokenizer."""
doc = en_tokenizer(text)
assert doc
@pytest.mark.issue(2656)
def test_issue2656(en_tokenizer):
"""Test that tokenizer correctly splits off punctuation after numbers with
decimal points.
@ -71,6 +75,7 @@ def test_issue2656(en_tokenizer):
assert doc[10].text == "."
@pytest.mark.issue(2671)
def test_issue2671():
"""Ensure the correct entity ID is returned for matches with quantifiers.
See also #2675
@ -94,6 +99,7 @@ def test_issue2671():
assert nlp.vocab.strings[match_id] == pattern_id
@pytest.mark.issue(2728)
def test_issue2728(en_vocab):
"""Test that displaCy ENT visualizer escapes HTML correctly."""
doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
@ -105,6 +111,7 @@ def test_issue2728(en_vocab):
assert "&lt;RELEASE&gt;" in html
@pytest.mark.issue(2754)
def test_issue2754(en_tokenizer):
"""Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
a = en_tokenizer("a")
@ -113,6 +120,7 @@ def test_issue2754(en_tokenizer):
assert am[0].norm_ == "am"
@pytest.mark.issue(2772)
def test_issue2772(en_vocab):
"""Test that deprojectivization doesn't mess up sentence boundaries."""
# fmt: off
@ -128,6 +136,7 @@ def test_issue2772(en_vocab):
@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
@pytest.mark.parametrize("lang_cls", [English, MultiLanguage])
@pytest.mark.issue(2782)
def test_issue2782(text, lang_cls):
"""Check that like_num handles + and - before number."""
nlp = lang_cls()
@ -136,6 +145,7 @@ def test_issue2782(text, lang_cls):
assert doc[0].like_num
@pytest.mark.issue(2800)
def test_issue2800():
"""Test issue that arises when too many labels are added to NER model.
Used to cause segfault.
@ -157,6 +167,7 @@ def test_issue2800():
nlp.update([example], sgd=optimizer, losses=losses, drop=0.5)
@pytest.mark.issue(2822)
def test_issue2822(it_tokenizer):
"""Test that the abbreviation of poco is kept as one word."""
doc = it_tokenizer("Vuoi un po' di zucchero?")
@ -169,6 +180,7 @@ def test_issue2822(it_tokenizer):
assert doc[5].text == "?"
@pytest.mark.issue(2833)
def test_issue2833(en_vocab):
"""Test that a custom error is raised if a token or span is pickled."""
doc = Doc(en_vocab, words=["Hello", "world"])
@ -178,6 +190,7 @@ def test_issue2833(en_vocab):
pickle.dumps(doc[0:2])
@pytest.mark.issue(2871)
def test_issue2871():
"""Test that vectors recover the correct key for spaCy reserved words."""
words = ["dog", "cat", "SUFFIX"]
@ -196,6 +209,7 @@ def test_issue2871():
assert vocab.vectors.find(key="SUFFIX") == 2
@pytest.mark.issue(2901)
def test_issue2901():
"""Test that `nlp` doesn't fail."""
try:
@ -207,6 +221,7 @@ def test_issue2901():
assert doc
@pytest.mark.issue(2926)
def test_issue2926(fr_tokenizer):
"""Test that the tokenizer correctly splits tokens separated by a slash (/)
ending in a digit.

View File

@ -14,6 +14,7 @@ from spacy.vectors import Vectors
import numpy
@pytest.mark.issue(3002)
def test_issue3002():
"""Test that the tokenizer doesn't hang on a long list of dots"""
nlp = German()
@ -23,6 +24,7 @@ def test_issue3002():
assert len(doc) == 5
@pytest.mark.issue(3009)
def test_issue3009(en_vocab):
"""Test problem with matcher quantifiers"""
patterns = [
@ -53,6 +55,7 @@ def test_issue3009(en_vocab):
assert matches
@pytest.mark.issue(3012)
def test_issue3012(en_vocab):
"""Test that the is_tagged attribute doesn't get overwritten when we from_array
without tag information."""
@ -74,6 +77,7 @@ def test_issue3012(en_vocab):
assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected
@pytest.mark.issue(3199)
def test_issue3199():
"""Test that Span.noun_chunks works correctly if no noun chunks iterator
is available. To make this test future-proof, we're constructing a Doc
@ -85,6 +89,7 @@ def test_issue3199():
list(doc[0:3].noun_chunks)
@pytest.mark.issue(3209)
def test_issue3209():
"""Test issue that occurred in spaCy nightly where NER labels were being
mapped to classes incorrectly after loading the model, when the labels
@ -104,6 +109,7 @@ def test_issue3209():
assert ner2.move_names == move_names
@pytest.mark.issue(3248)
def test_issue3248_1():
"""Test that the PhraseMatcher correctly reports its number of rules, not
total number of patterns."""
@ -114,6 +120,7 @@ def test_issue3248_1():
assert len(matcher) == 2
@pytest.mark.issue(3248)
def test_issue3248_2():
"""Test that the PhraseMatcher can be pickled correctly."""
nlp = English()
@ -125,6 +132,7 @@ def test_issue3248_2():
assert len(new_matcher) == len(matcher)
@pytest.mark.issue(3277)
def test_issue3277(es_tokenizer):
"""Test that hyphens are split correctly as prefixes."""
doc = es_tokenizer("—Yo me llamo... murmuró el niño Emilio Sánchez Pérez.")
@ -134,6 +142,7 @@ def test_issue3277(es_tokenizer):
assert doc[9].text == "\u2013"
@pytest.mark.issue(3288)
def test_issue3288(en_vocab):
"""Test that retokenization works correctly via displaCy when punctuation
is merged onto the preceding token and tensor is resized."""
@ -145,6 +154,7 @@ def test_issue3288(en_vocab):
displacy.render(doc)
@pytest.mark.issue(3289)
def test_issue3289():
"""Test that Language.to_bytes handles serializing a pipeline component
with an uninitialized model."""
@ -156,6 +166,7 @@ def test_issue3289():
new_nlp.from_bytes(bytes_data)
@pytest.mark.issue(3328)
def test_issue3328(en_vocab):
doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"])
matcher = Matcher(en_vocab)
@ -170,6 +181,7 @@ def test_issue3328(en_vocab):
assert matched_texts == ["Hello", "how", "you", "doing"]
@pytest.mark.issue(3331)
def test_issue3331(en_vocab):
"""Test that duplicate patterns for different rules result in multiple
matches, one per rule.
@ -184,6 +196,7 @@ def test_issue3331(en_vocab):
assert sorted(match_ids) == ["A", "B"]
@pytest.mark.issue(3345)
def test_issue3345():
"""Test case where preset entity crosses sentence boundary."""
nlp = English()
@ -206,6 +219,7 @@ def test_issue3345():
assert ner.moves.is_valid(state, "B-GPE")
@pytest.mark.issue(3412)
def test_issue3412():
data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
vectors = Vectors(data=data, keys=["A", "B", "C"])
@ -216,6 +230,7 @@ def test_issue3412():
@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot")
@pytest.mark.issue(3449)
def test_issue3449():
nlp = English()
nlp.add_pipe("sentencizer")
@ -230,6 +245,7 @@ def test_issue3449():
assert t3[5].text == "I"
@pytest.mark.issue(3456)
def test_issue3456():
# this crashed because of a padding error in layer.ops.unflatten in thinc
nlp = English()
@ -239,6 +255,7 @@ def test_issue3456():
list(nlp.pipe(["hi", ""]))
@pytest.mark.issue(3468)
def test_issue3468():
"""Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can
be restored after serialization."""

View File

@ -24,6 +24,7 @@ from ..util import make_tempdir
@pytest.mark.parametrize("word", ["don't", "dont", "I'd", "Id"])
@pytest.mark.issue(3521)
def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms
@ -108,6 +109,7 @@ def test_issue_3526_4(en_vocab):
assert new_ruler.overwrite is True
@pytest.mark.issue(3531)
def test_issue3531():
"""Test that displaCy renderer doesn't require "settings" key."""
example_dep = {
@ -137,6 +139,7 @@ def test_issue3531():
assert ent_html
@pytest.mark.issue(3540)
def test_issue3540(en_vocab):
words = ["I", "live", "in", "NewYork", "right", "now"]
tensor = numpy.asarray(
@ -176,6 +179,7 @@ def test_issue3540(en_vocab):
assert vectors_1[5].tolist() == vectors_2[6].tolist()
@pytest.mark.issue(3549)
def test_issue3549(en_vocab):
"""Test that match pattern validation doesn't raise on empty errors."""
matcher = Matcher(en_vocab, validate=True)
@ -186,6 +190,7 @@ def test_issue3549(en_vocab):
@pytest.mark.skip("Matching currently only works on strings and integers")
@pytest.mark.issue(3555)
def test_issue3555(en_vocab):
"""Test that custom extensions with default None don't break matcher."""
Token.set_extension("issue3555", default=None)
@ -196,6 +201,7 @@ def test_issue3555(en_vocab):
matcher(doc)
@pytest.mark.issue(3611)
def test_issue3611():
"""Test whether adding n-grams in the textcat works even when n > token length of some docs"""
unique_classes = ["offensive", "inoffensive"]
@ -232,6 +238,7 @@ def test_issue3611():
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
@pytest.mark.issue(3625)
def test_issue3625():
"""Test that default punctuation rules applies to hindi unicode characters"""
nlp = Hindi()
@ -240,6 +247,7 @@ def test_issue3625():
assert [token.text for token in doc] == expected
@pytest.mark.issue(3803)
def test_issue3803():
"""Test that spanish num-like tokens have True for like_num attribute."""
nlp = Spanish()
@ -255,6 +263,7 @@ def _parser_example(parser):
return Example.from_dict(doc, gold)
@pytest.mark.issue(3830)
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {
@ -268,6 +277,7 @@ def test_issue3830_no_subtok():
assert "subtok" not in parser.labels
@pytest.mark.issue(3830)
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {
@ -281,6 +291,7 @@ def test_issue3830_with_subtok():
assert "subtok" in parser.labels
@pytest.mark.issue(3839)
def test_issue3839(en_vocab):
"""Test that match IDs returned by the matcher are correct, are in the string"""
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
@ -307,6 +318,7 @@ def test_issue3839(en_vocab):
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
],
)
@pytest.mark.issue(3869)
def test_issue3869(sentence):
"""Test that the Doc's count_by function works consistently"""
nlp = English()
@ -317,6 +329,7 @@ def test_issue3869(sentence):
assert count == doc.count_by(IS_ALPHA).get(1, 0)
@pytest.mark.issue(3879)
def test_issue3879(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
assert len(doc) == 5
@ -326,6 +339,7 @@ def test_issue3879(en_vocab):
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
@pytest.mark.issue(3880)
def test_issue3880():
"""Test that `nlp.pipe()` works when an empty string ends the batch.
@ -341,6 +355,7 @@ def test_issue3880():
pass
@pytest.mark.issue(3882)
def test_issue3882(en_vocab):
"""Test that displaCy doesn't serialize the doc.user_data when making a
copy of the Doc.
@ -350,6 +365,7 @@ def test_issue3882(en_vocab):
parse_deps(doc)
@pytest.mark.issue(3951)
def test_issue3951(en_vocab):
"""Test that combinations of optional rules are matched correctly."""
matcher = Matcher(en_vocab)
@ -365,6 +381,7 @@ def test_issue3951(en_vocab):
assert len(matches) == 0
@pytest.mark.issue(3959)
def test_issue3959():
"""Ensure that a modified pos attribute is serialized correctly."""
nlp = English()
@ -383,6 +400,7 @@ def test_issue3959():
assert doc2[0].pos_ == "NOUN"
@pytest.mark.issue(3962)
def test_issue3962(en_vocab):
"""Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
@ -421,6 +439,7 @@ def test_issue3962(en_vocab):
assert len(list(doc3.sents)) == 1
@pytest.mark.issue(3962)
def test_issue3962_long(en_vocab):
"""Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
@ -456,6 +475,7 @@ def test_issue3962_long(en_vocab):
assert sents[1].text == "They never"
@pytest.mark.issue(3972)
def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs."""
matcher = PhraseMatcher(en_vocab)

View File

@ -17,6 +17,7 @@ from thinc.api import compounding
from ..util import make_tempdir
@pytest.mark.issue(4002)
def test_issue4002(en_vocab):
"""Test that the PhraseMatcher can match on overwritten NORM attributes."""
matcher = PhraseMatcher(en_vocab, attr="NORM")
@ -37,6 +38,7 @@ def test_issue4002(en_vocab):
assert len(matches) == 1
@pytest.mark.issue(4030)
def test_issue4030():
"""Test whether textcat works fine with empty doc"""
unique_classes = ["offensive", "inoffensive"]
@ -77,6 +79,7 @@ def test_issue4030():
assert doc.cats["inoffensive"] == 0.0
@pytest.mark.issue(4042)
def test_issue4042():
"""Test that serialization of an EntityRuler before NER works fine."""
nlp = English()
@ -105,6 +108,7 @@ def test_issue4042():
assert doc2.ents[0].label_ == "MY_ORG"
@pytest.mark.issue(4042)
def test_issue4042_bug2():
"""
Test that serialization of an NER works fine when new labels were added.
@ -139,6 +143,7 @@ def test_issue4042_bug2():
assert len(ner2.labels) == 2
@pytest.mark.issue(4054)
def test_issue4054(en_vocab):
"""Test that a new blank model can be made with a vocab from file,
and that serialization does not drop the language at any point."""
@ -159,6 +164,7 @@ def test_issue4054(en_vocab):
assert nlp3.lang == "en"
@pytest.mark.issue(4120)
def test_issue4120(en_vocab):
"""Test that matches without a final {OP: ?} token are returned."""
matcher = Matcher(en_vocab)
@ -177,6 +183,7 @@ def test_issue4120(en_vocab):
assert len(matcher(doc4)) == 3 # fixed
@pytest.mark.issue(4133)
def test_issue4133(en_vocab):
nlp = English()
vocab_bytes = nlp.vocab.to_bytes()
@ -196,6 +203,7 @@ def test_issue4133(en_vocab):
assert actual == pos
@pytest.mark.issue(4190)
def test_issue4190():
def customize_tokenizer(nlp):
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
@ -236,6 +244,7 @@ def test_issue4190():
assert result_1b == result_2
@pytest.mark.issue(4267)
def test_issue4267():
"""Test that running an entity_ruler after ner gives consistent results"""
nlp = English()
@ -262,6 +271,7 @@ def test_issue4267():
@pytest.mark.skip(reason="lemmatizer lookups no longer in vocab")
@pytest.mark.issue(4272)
def test_issue4272():
"""Test that lookup table can be accessed from Token.lemma if no POS tags
are available."""
@ -287,6 +297,7 @@ def test_multiple_predictions():
dummy_pipe(doc)
@pytest.mark.issue(4313)
def test_issue4313():
"""This should not crash or exit with some strange error code"""
beam_width = 16
@ -313,6 +324,7 @@ def test_issue4313():
assert "MY_ORG" in ner.labels
@pytest.mark.issue(4348)
def test_issue4348():
"""Test that training the tagger with empty data, doesn't throw errors"""
nlp = English()
@ -328,6 +340,7 @@ def test_issue4348():
nlp.update(batch, sgd=optimizer, losses=losses)
@pytest.mark.issue(4367)
def test_issue4367():
"""Test that docbin init goes well"""
DocBin()
@ -335,6 +348,7 @@ def test_issue4367():
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
@pytest.mark.issue(4373)
def test_issue4373():
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
matcher = Matcher(Vocab())
@ -343,6 +357,7 @@ def test_issue4373():
assert isinstance(matcher.vocab, Vocab)
@pytest.mark.issue(4402)
def test_issue4402():
json_data = {
"id": 0,

View File

@ -14,6 +14,7 @@ from thinc.api import NumpyOps, get_current_ops
from ..util import make_tempdir
@pytest.mark.issue(4528)
def test_issue4528(en_vocab):
"""Test that user_data is correctly serialized in DocBin."""
doc = Doc(en_vocab, words=["hello", "world"])
@ -37,6 +38,7 @@ def test_gold_misaligned(en_tokenizer, text, words):
Example.from_dict(doc, {"words": words})
@pytest.mark.issue(4651)
def test_issue4651_with_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialized correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
@ -59,6 +61,7 @@ def test_issue4651_with_phrase_matcher_attr():
assert res == res_reloaded
@pytest.mark.issue(4651)
def test_issue4651_without_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialized correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
@ -81,6 +84,7 @@ def test_issue4651_without_phrase_matcher_attr():
assert res == res_reloaded
@pytest.mark.issue(4665)
def test_issue4665():
"""
conllu_to_docs should not raise an exception if the HEAD column contains an
@ -109,6 +113,7 @@ def test_issue4665():
conllu_to_docs(input_data)
@pytest.mark.issue(4674)
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
@ -135,6 +140,7 @@ def test_issue4674():
@pytest.mark.skip(reason="API change: disable just disables, new exclude arg")
@pytest.mark.issue(4707)
def test_issue4707():
"""Tests that disabled component names are also excluded from nlp.from_disk
by default when loading a model.
@ -151,6 +157,7 @@ def test_issue4707():
assert "entity_ruler" in new_nlp.pipe_names
@pytest.mark.issue(4725)
def test_issue4725_1():
"""Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector")
@ -169,6 +176,7 @@ def test_issue4725_1():
assert ner2.cfg["update_with_oracle_cut_size"] == 111
@pytest.mark.issue(4725)
def test_issue4725_2():
if isinstance(get_current_ops, NumpyOps):
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
@ -188,6 +196,7 @@ def test_issue4725_2():
pass
@pytest.mark.issue(4849)
def test_issue4849():
nlp = English()
patterns = [
@ -235,6 +244,7 @@ class CustomPipe:
return str(span.end)
@pytest.mark.issue(4903)
def test_issue4903():
"""Ensure that this runs correctly and doesn't hang or crash on Windows /
macOS."""
@ -249,6 +259,7 @@ def test_issue4903():
assert docs[2].text == "No, I prefer wasabi."
@pytest.mark.issue(4924)
def test_issue4924():
nlp = Language()
example = Example.from_dict(nlp.make_doc(""), {})

View File

@ -12,6 +12,7 @@ import pytest
from ...util import make_tempdir
@pytest.mark.issue(5048)
def test_issue5048(en_vocab):
words = ["This", "is", "a", "sentence"]
pos_s = ["DET", "VERB", "DET", "NOUN"]
@ -34,6 +35,7 @@ def test_issue5048(en_vocab):
assert v1 == v2
@pytest.mark.issue(5082)
def test_issue5082():
# Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens
nlp = English()
@ -68,6 +70,7 @@ def test_issue5082():
numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34)
@pytest.mark.issue(5137)
def test_issue5137():
factory_name = "test_issue5137"
pipe_name = "my_component"
@ -98,6 +101,7 @@ def test_issue5137():
assert nlp2.get_pipe(pipe_name).categories == "my_categories"
@pytest.mark.issue(5141)
def test_issue5141(en_vocab):
"""Ensure an empty DocBin does not crash on serialization"""
doc_bin = DocBin(attrs=["DEP", "HEAD"])
@ -107,6 +111,7 @@ def test_issue5141(en_vocab):
assert list(doc_bin_2.get_docs(en_vocab)) == []
@pytest.mark.issue(5152)
def test_issue5152():
# Test that the comparison between a Span and a Token goes well
# There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
@ -125,6 +130,7 @@ def test_issue5152():
assert span_2.similarity(span_3) < 1.0
@pytest.mark.issue(5458)
def test_issue5458():
# Test that the noun chunker does not generate overlapping spans
# fmt: off

View File

@ -25,6 +25,7 @@ from spacy.training import Example
multi_label_cnn_config,
],
)
@pytest.mark.issue(5551)
def test_issue5551(textcat_config):
"""Test that after fixing the random seed, the results of the pipeline are truly identical"""
component = "textcat"
@ -53,6 +54,7 @@ def test_issue5551(textcat_config):
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5)
@pytest.mark.issue(5838)
def test_issue5838():
# Test that displaCy's EntityRenderer renders the line break correctly
# after the last entity
@ -65,6 +67,7 @@ def test_issue5838():
assert found == 4
@pytest.mark.issue(5918)
def test_issue5918():
# Test edge case when merging entities.
nlp = English()

View File

@ -4,6 +4,7 @@ from spacy.schemas import TokenPattern, TokenPatternSchema
import pytest
@pytest.mark.issue(6207)
def test_issue6207(en_tokenizer):
doc = en_tokenizer("zero one two three four five six")
@ -18,6 +19,7 @@ def test_issue6207(en_tokenizer):
assert s3 in result
@pytest.mark.issue(6258)
def test_issue6258():
"""Test that the non-empty constraint pattern field is respected"""
# These one is valid

View File

@ -13,6 +13,7 @@ import pickle
from ..util import make_tempdir
@pytest.mark.issue(6730)
def test_issue6730(en_vocab):
"""Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
from spacy.kb import KnowledgeBase
@ -34,6 +35,7 @@ def test_issue6730(en_vocab):
assert set(kb.get_alias_strings()) == {"x", "y"}
@pytest.mark.issue(6755)
def test_issue6755(en_tokenizer):
doc = en_tokenizer("This is a magnificent sentence.")
span = doc[:0]
@ -45,6 +47,7 @@ def test_issue6755(en_tokenizer):
"sentence, start_idx,end_idx,label",
[("Welcome to Mumbai, my friend", 11, 17, "GPE")],
)
@pytest.mark.issue(6815)
def test_issue6815_1(sentence, start_idx, end_idx, label):
nlp = English()
doc = nlp(sentence)
@ -55,6 +58,7 @@ def test_issue6815_1(sentence, start_idx, end_idx, label):
@pytest.mark.parametrize(
"sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
)
@pytest.mark.issue(6815)
def test_issue6815_2(sentence, start_idx, end_idx, kb_id):
nlp = English()
doc = nlp(sentence)
@ -66,6 +70,7 @@ def test_issue6815_2(sentence, start_idx, end_idx, kb_id):
"sentence, start_idx,end_idx,vector",
[("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
)
@pytest.mark.issue(6815)
def test_issue6815_3(sentence, start_idx, end_idx, vector):
nlp = English()
doc = nlp(sentence)
@ -73,6 +78,7 @@ def test_issue6815_3(sentence, start_idx, end_idx, vector):
assert (span.vector == vector).all()
@pytest.mark.issue(6839)
def test_issue6839(en_vocab):
"""Ensure that PhraseMatcher accepts Span as input"""
# fmt: off
@ -155,6 +161,7 @@ labels = ['label1', 'label2']
"component_name",
["textcat", "textcat_multilabel"],
)
@pytest.mark.issue(6908)
def test_issue6908(component_name):
"""Test intializing textcat with labels in a list"""
@ -219,6 +226,7 @@ upstream = "*"
"""
@pytest.mark.issue(6950)
def test_issue6950():
"""Test that the nlp object with initialized tok2vec with listeners pickles
correctly (and doesn't have lambdas).

View File

@ -1,3 +1,4 @@
import pytest
from spacy.cli.evaluate import print_textcats_auc_per_cat, print_prf_per_type
from spacy.lang.en import English
from spacy.training import Example
@ -13,6 +14,7 @@ from wasabi import msg
from ..util import make_tempdir
@pytest.mark.issue(7019)
def test_issue7019():
scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None}
print_textcats_auc_per_cat(msg, scores)
@ -64,6 +66,7 @@ upstream = "*"
"""
@pytest.mark.issue(7029)
def test_issue7029():
"""Test that an empty document doesn't mess up an entire batch."""
TRAIN_DATA = [
@ -84,6 +87,7 @@ def test_issue7029():
assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]]
@pytest.mark.issue(7055)
def test_issue7055():
"""Test that fill-config doesn't turn sourced components into factories."""
source_cfg = {
@ -118,6 +122,7 @@ def test_issue7055():
assert "model" in filled_cfg["components"]["ner"]
@pytest.mark.issue(7056)
def test_issue7056():
"""Test that the Unshift transition works properly, and doesn't cause
sentence segmentation errors."""
@ -190,6 +195,7 @@ def test_partial_links():
assert "ORG" not in results["nel_f_per_type"]
@pytest.mark.issue(7065)
def test_issue7065():
text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival."
nlp = English()
@ -217,6 +223,7 @@ def test_issue7065():
assert sentences.index(ent.sent) == 0
@pytest.mark.issue(7065)
def test_issue7065_b():
# Test that the NEL doesn't crash when an entity crosses a sentence boundary
nlp = English()

View File

@ -43,6 +43,7 @@ def parser(vocab):
return parser
@pytest.mark.issue(7716)
@pytest.mark.xfail(reason="Not fixed yet")
def test_partial_annotation(parser):
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])

View File

@ -1,8 +1,11 @@
import pytest
import spacy
from spacy.lang.en import English
from ..util import make_tempdir
@pytest.mark.issue(8190)
def test_issue8190():
"""Test that config overrides are not lost after load is complete."""
source_cfg = {

View File

@ -22,6 +22,7 @@ def patterns():
]
@pytest.mark.issue(8216)
def test_entity_ruler_fix8216(nlp, patterns):
"""Test that patterns don't get added excessively."""
ruler = nlp.add_pipe("entity_ruler", config={"validate": True})

View File

@ -162,6 +162,7 @@ def test_serialize_tagger_strings(en_vocab, de_vocab, taggers):
assert label in tagger2.vocab.strings
@pytest.mark.issue(1105)
def test_serialize_textcat_empty(en_vocab):
# See issue #1105
cfg = {"model": DEFAULT_SINGLE_TEXTCAT_MODEL}

View File

@ -492,7 +492,6 @@ def test_string_to_list_intify(value):
assert string_to_list(value, intify=True) == [1, 2, 3]
@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release")
def test_download_compatibility():
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False
@ -503,7 +502,6 @@ def test_download_compatibility():
assert get_minor_version(about.__version__) == get_minor_version(version)
@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release")
def test_validate_compatibility_table():
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False

View File

@ -2,11 +2,10 @@ from inspect import isclass
import pytest
from spacy.errors import add_codes
from spacy.errors import ErrorsWithCodes
@add_codes
class Errors:
class Errors(metaclass=ErrorsWithCodes):
E001 = "error description"

View File

@ -216,8 +216,8 @@ def test_tokenizer_flush_specials(en_vocab):
def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
# the prefix and suffix matches overlap in the suffix lookbehind
prefixes = ['a(?=.)']
suffixes = [r'(?<=\w)\.', r'(?<=a)\d+\.']
prefixes = ["a(?=.)"]
suffixes = [r"(?<=\w)\.", r"(?<=a)\d+\."]
prefix_re = compile_prefix_regex(prefixes)
suffix_re = compile_suffix_regex(suffixes)
tokenizer = Tokenizer(

View File

@ -524,6 +524,7 @@ def test_roundtrip_docs_to_docbin(doc):
assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"]
assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
def test_docbin_user_data_serialized(doc):
doc.user_data["check"] = True
nlp = English()
@ -536,6 +537,7 @@ def test_docbin_user_data_serialized(doc):
assert reloaded_doc.user_data["check"] == True
def test_docbin_user_data_not_serialized(doc):
# this isn't serializable, but that shouldn't cause an error
doc.user_data["check"] = set()
@ -549,6 +551,7 @@ def test_docbin_user_data_not_serialized(doc):
assert "check" not in reloaded_doc.user_data
@pytest.mark.parametrize(
"tokens_a,tokens_b,expected",
[

View File

@ -139,8 +139,12 @@ class Doc:
def count_by(
self, attr_id: int, exclude: Optional[Any] = ..., counts: Optional[Any] = ...
) -> Dict[Any, int]: ...
def from_array(self, attrs: Union[int, str, List[Union[int, str]]], array: Ints2d) -> Doc: ...
def to_array(self, py_attr_ids: Union[int, str, List[Union[int, str]]]) -> numpy.ndarray: ...
def from_array(
self, attrs: Union[int, str, List[Union[int, str]]], array: Ints2d
) -> Doc: ...
def to_array(
self, py_attr_ids: Union[int, str, List[Union[int, str]]]
) -> numpy.ndarray: ...
@staticmethod
def from_docs(
docs: List[Doc],

View File

@ -41,7 +41,10 @@ def create_docbin_reader(
@util.registry.readers("spacy.JsonlCorpus.v1")
def create_jsonl_reader(
path: Union[str, Path], min_length: int = 0, max_length: int = 0, limit: int = 0
path: Optional[Union[str, Path]],
min_length: int = 0,
max_length: int = 0,
limit: int = 0,
) -> Callable[["Language"], Iterable[Example]]:
return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit)
@ -221,7 +224,7 @@ class JsonlCorpus:
def __init__(
self,
path: Union[str, Path],
path: Optional[Union[str, Path]],
*,
limit: int = 0,
min_length: int = 0,

View File

@ -50,7 +50,9 @@ def pretrain(
# TODO: move this to logger function?
tracker = ProgressTracker(frequency=10000)
if P["n_save_epoch"]:
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume} - saving every {P['n_save_epoch']} epoch")
msg.divider(
f"Pre-training tok2vec layer - starting at epoch {epoch_resume} - saving every {P['n_save_epoch']} epoch"
)
else:
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}

View File

@ -288,16 +288,17 @@ def find_matching_language(lang: str) -> Optional[str]:
None
"""
import spacy.lang # noqa: F401
if lang == 'xx':
return 'xx'
if lang == "xx":
return "xx"
# Find out which language modules we have
possible_languages = []
for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore
code = modinfo.name
if code == 'xx':
if code == "xx":
# Temporarily make 'xx' into a valid language code
possible_languages.append('mul')
possible_languages.append("mul")
elif langcodes.tag_is_valid(code):
possible_languages.append(code)
@ -306,12 +307,10 @@ def find_matching_language(lang: str) -> Optional[str]:
# more possibilities, like variants of Chinese like 'wuu', but text that
# is labeled that way is probably trying to be distinct from 'zh' and
# shouldn't automatically match.
match = langcodes.closest_supported_match(
lang, possible_languages, max_distance=9
)
if match == 'mul':
match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
if match == "mul":
# Convert 'mul' back to spaCy's 'xx'
return 'xx'
return "xx"
else:
return match

View File

@ -82,7 +82,7 @@ consisting of a CNN and a layer-normalized maxout activation function.
| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 20 words at a time. Recommended value is `1`. ~~int~~ |
| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |

View File

@ -1000,6 +1000,11 @@ subclasses like `English` or `German` to make language-specific functionality
like the [lexical attribute getters](/usage/linguistic-features#language-data)
available to the loaded object.
Note that if you want to serialize and reload a whole pipeline, using this alone
won't work; you also need to handle the config. See
["Serializing the pipeline"](https://spacy.io/usage/saving-loading#pipeline) for
details.
> #### Example
>
> ```python

View File

@ -105,7 +105,7 @@ coarse-grained part-of-speech tags and morphological features.
that the verb is past tense (e.g. `VBD` for a past tense verb in the Penn
Treebank) .
2. For words whose coarse-grained POS is not set by a prior process, a
[mapping table](#mapping-exceptions) maps the fine-grained tags to a
[mapping table](#mappings-exceptions) maps the fine-grained tags to a
coarse-grained POS tags and morphological features.
```python

website/docs/usage/v3-2.md (new file, 244 lines)
View File

@ -0,0 +1,244 @@
---
title: What's New in v3.2
teaser: New features and how to upgrade
menu:
- ['New Features', 'features']
- ['Upgrading Notes', 'upgrading']
---
## New Features {#features hidden="true"}
spaCy v3.2 adds support for [`floret`](https://github.com/explosion/floret)
vectors, makes custom `Doc` creation and scoring easier, and includes many bug
fixes and improvements. For the trained pipelines, there's a new transformer
pipeline for Japanese and the Universal Dependencies training data has been
updated across the board to the most recent release.
<Infobox title="Improve performance for spaCy on Apple M1 with AppleOps" variant="warning" emoji="📣">
spaCy is now up to **8 &times; faster on M1 Macs** by calling into Apple's
native Accelerate library for matrix multiplication. For more details, see
[`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops).
```bash
$ pip install spacy[apple]
```
</Infobox>
### Registered scoring functions {#registered-scoring-functions}
To customize the scoring, you can specify a scoring function for each component
in your config from the new [`scorers` registry](/api/top-level#registry):
```ini
### config.cfg (excerpt) {highlight="3"}
[components.tagger]
factory = "tagger"
scorer = {"@scorers":"spacy.tagger_scorer.v1"}
```
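A scoring function is registered in Python and then referenced by name from the
config. The sketch below is a minimal example, assuming spaCy v3.2+; the
registry name `my_tagger_scorer.v1` is an illustrative placeholder, and it
simply delegates to the built-in token-attribute scorer:

```python
from spacy import registry
from spacy.scorer import Scorer

@registry.scorers("my_tagger_scorer.v1")  # placeholder name
def make_my_tagger_scorer():
    def score(examples, **kwargs):
        # Reuse the built-in token attribute scorer for the TAG attribute
        return Scorer.score_token_attr(examples, "tag", **kwargs)

    return score
```

With the function registered, the component's `scorer` setting can point at
`{"@scorers":"my_tagger_scorer.v1"}` instead of the default
`spacy.tagger_scorer.v1`.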
### Overwrite settings {#overwrite}
Most pipeline components now include an `overwrite` setting in the config that
determines whether existing annotation in the `Doc` is preserved or overwritten:
```ini
### config.cfg (excerpt) {highlight="3"}
[components.tagger]
factory = "tagger"
overwrite = false
```
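The same setting can also be passed when a component is added in Python; a
minimal sketch, assuming the built-in tagger factory:

```python
import spacy

nlp = spacy.blank("en")
# Keep any TAG values already set on the Doc instead of overwriting them
nlp.add_pipe("tagger", config={"overwrite": False})
```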
### Doc input for pipelines {#doc-input}
[`nlp`](/api/language#call) and [`nlp.pipe`](/api/language#pipe) accept
[`Doc`](/api/doc) input, skipping the tokenizer if a `Doc` is provided instead
of a string. This makes it easier to create a `Doc` with custom tokenization or
to set custom extensions before processing:
```python
doc = nlp.make_doc("This is text 500.")
doc._.text_id = 500
doc = nlp(doc)
```
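For the `doc._.text_id` assignment above to work, the extension has to be
registered first; a slightly more complete sketch, with `text_id` as an
illustrative attribute name:

```python
import spacy
from spacy.tokens import Doc

Doc.set_extension("text_id", default=None)

nlp = spacy.blank("en")
doc = nlp.make_doc("This is text 500.")
doc._.text_id = 500
doc = nlp(doc)  # the tokenizer is skipped because a Doc is passed in
```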
### Support for floret vectors {#vectors}
We recently published [`floret`](https://github.com/explosion/floret), an
extended version of [fastText](https://fasttext.cc) that combines fastText's
subwords with Bloom embeddings for compact, full-coverage vectors. The use of
subwords means that there are no OOV words and due to Bloom embeddings, the
vector table can be kept very small at <100K entries. Bloom embeddings are
already used by [HashEmbed](https://thinc.ai/docs/api-layers#hashembed) in
[tok2vec](/api/architectures#tok2vec-arch) for compact spaCy models.
For easy integration, floret includes a
[Python wrapper](https://github.com/explosion/floret/blob/main/python/README.md):
```bash
$ pip install floret
```
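As a rough sketch of the training step (the wrapper's keyword arguments mirror
floret's CLI options, so treat the argument names below as assumptions and
check the floret README for the exact API):

```python
# Rough sketch only: argument names such as mode and hashCount are
# assumptions based on floret's CLI options; see the floret README.
import floret

model = floret.train_unsupervised(
    "tokenized_texts.txt",  # one whitespace-tokenized text per line
    model="cbow",
    mode="floret",          # Bloom-embedding ("floret") mode
    hashCount=2,            # number of hashes per entry
    bucket=50000,           # size of the compact vector table
    minn=4,
    maxn=5,
)
model.save_vectors("vectors.floret")
```

The exported table can then be imported into a pipeline with
[`init vectors`](/api/cli#init-vectors) in its floret mode.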
A demo project shows how to train and import floret vectors:
<Project id="pipelines/floret_vectors_demo">
Train toy English floret vectors and import them into a spaCy pipeline.
</Project>
Two additional demo projects compare standard fastText vectors with floret
vectors for full spaCy pipelines. For agglutinative languages like Finnish or
Korean, there are large improvements in performance due to the use of subwords
(no OOV words!), with a vector table containing merely 50K entries.
<Project id="pipelines/floret_fi_core_demo">
Finnish UD+NER vector and pipeline training, comparing standard fasttext vs.
floret vectors.
For the default project settings with 1M (2.6G) tokenized training texts and 50K
300-dim vectors, ~300K keys for the standard vectors:
| Vectors | TAG | POS | DEP UAS | DEP LAS | NER F |
| -------------------------------------------- | -------: | -------: | -------: | -------: | -------: |
| none | 93.3 | 92.3 | 79.7 | 72.8 | 61.0 |
| standard (pruned: 50K vectors for 300K keys) | 95.9 | 94.7 | 83.3 | 77.9 | 68.5 |
| standard (unpruned: 300K vectors/keys) | 96.0 | 95.0 | **83.8** | 78.4 | 69.1 |
| floret (minn 4, maxn 5; 50K vectors, no OOV) | **96.6** | **95.5** | 83.5 | **78.5** | **70.9** |
</Project>
<Project id="pipelines/floret_ko_ud_demo">
Korean UD vector and pipeline training, comparing standard fasttext vs. floret
vectors.
For the default project settings with 1M (3.3G) tokenized training texts and 50K
300-dim vectors, ~800K keys for the standard vectors:
| Vectors | TAG | POS | DEP UAS | DEP LAS |
| -------------------------------------------- | -------: | -------: | -------: | -------: |
| none | 72.5 | 85.0 | 73.2 | 64.3 |
| standard (pruned: 50K vectors for 800K keys) | 77.9 | 89.4 | 78.8 | 72.8 |
| standard (unpruned: 800K vectors/keys) | 79.0 | 90.2 | 79.2 | 73.9 |
| floret (minn 2, maxn 3; 50K vectors, no OOV) | **82.5** | **93.8** | **83.0** | **80.1** |
</Project>
### Updates for spacy-transformers v1.1 {#spacy-transformers}
[`spacy-transformers`](https://github.com/explosion/spacy-transformers) v1.1 has
been refactored to improve serialization and to support inline transformer
components and replacing listeners. In addition, the transformer model output is
provided as
[`ModelOutput`](https://huggingface.co/transformers/main_classes/output.html?highlight=modeloutput#transformers.file_utils.ModelOutput)
instead of tuples in
`TransformerData.model_output` and `FullTransformerBatch.model_output`. For
backwards compatibility, the tuple format remains available under
`TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details
in the [transformer API docs](/api/architectures#TransformerModel).
`spacy-transformers` v1.1 also adds support for `transformer_config` settings
such as `output_attentions`. Additional output is stored under
`TransformerData.model_output`. More details are in the
[TransformerModel docs](/api/architectures#TransformerModel). The training speed
has been improved by streamlining allocations for tokenizer output and there is
new support for [mixed-precision training](/api/architectures#TransformerModel).
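In practice the new output is available through the `Doc._.trf_data` extension;
a minimal sketch, assuming spacy-transformers v1.1 and an installed transformer
pipeline such as `en_core_web_trf`:

```python
import spacy

nlp = spacy.load("en_core_web_trf")
doc = nlp("spaCy v3.2 works with spacy-transformers v1.1.")

trf_data = doc._.trf_data
print(type(trf_data.model_output))  # ModelOutput from transformers
print(type(trf_data.tensors))       # tuple format kept for backwards compatibility
```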
### New transformer package for Japanese {#pipeline-packages}
spaCy v3.2 adds a new transformer pipeline package for Japanese
[`ja_core_news_trf`](/models/ja#ja_core_news_trf), which uses the `basic`
pretokenizer instead of `mecab` to limit the number of dependencies required for
the pipeline. Thanks to Hiroshi Matsuda and the spaCy Japanese community for
their contributions!
### Pipeline and language updates {#pipeline-updates}
- All Universal Dependencies training data has been updated to v2.8.
- The Catalan data, tokenizer and lemmatizer have been updated, thanks to Carlos
Rodriguez and the Barcelona Supercomputing Center!
- The transformer pipelines are trained using spacy-transformers v1.1, with
improved IO and more options for
[model config and output](/api/architectures#TransformerModel).
- Trailing whitespace has been added as a `tok2vec` feature, improving the
performance for many components, especially fine-grained tagging and sentence
segmentation.
- The English attribute ruler patterns have been overhauled to improve
`Token.pos` and `Token.morph`.
spaCy v3.2 also features a new Irish lemmatizer, support for `noun_chunks` in
Portuguese, improved `noun_chunks` for Spanish and additional updates for
Bulgarian, Catalan, Sinhala, Tagalog, Tigrinya and Vietnamese.
## Notes about upgrading from v3.1 {#upgrading}
### Pipeline package version compatibility {#version-compat}
> #### Using legacy implementations
>
> In spaCy v3, you'll still be able to load and reference legacy implementations
> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
> components or architectures change and newer versions are available in the
> core library.
When you're loading a pipeline package trained with spaCy v3.0 or v3.1, you will
see a warning telling you that the pipeline may be incompatible. This doesn't
necessarily have to be true, but we recommend running your pipelines against
your test suite or evaluation data to make sure there are no unexpected results.
If you're using one of the [trained pipelines](/models) we provide, you should
run [`spacy download`](/api/cli#download) to update to the latest version. To
see an overview of all installed packages and their compatibility, you can run
[`spacy validate`](/api/cli#validate).
If you've trained your own custom pipeline and you've confirmed that it's still
working as expected, you can update the spaCy version requirements in the
[`meta.json`](/api/data-formats#meta):
```diff
- "spacy_version": ">=3.1.0,<3.2.0",
+ "spacy_version": ">=3.2.0,<3.3.0",
```
### Updating v3.1 configs
To update a config from spaCy v3.1 with the new v3.2 settings, run
[`init fill-config`](/api/cli#init-fill-config):
```cli
$ python -m spacy init fill-config config-v3.1.cfg config-v3.2.cfg
```
In many cases ([`spacy train`](/api/cli#train),
[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
automatically, but you'll need to fill in the new settings to run
[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data).
## Notes about upgrading from spacy-transformers v1.0 {#upgrading-transformers}
When you're loading a transformer pipeline package trained with
[`spacy-transformers`](https://github.com/explosion/spacy-transformers) v1.0
after upgrading to `spacy-transformers` v1.1, you'll see a warning telling you
that the pipeline may be incompatible. `spacy-transformers` v1.1 should be able
to import v1.0 `transformer` components into the new internal format with no
change in performance, but here we'd also recommend running your test suite to
verify that the pipeline still performs as expected.
If you save your pipeline with [`nlp.to_disk`](/api/language#to_disk), it will
be saved in the new v1.1 format and should be fully compatible with
`spacy-transformers` v1.1. Once you've confirmed the performance, you can update
the requirements in [`meta.json`](/api/data-formats#meta):
```diff
"requirements": [
- "spacy-transformers>=1.0.3,<1.1.0"
+ "spacy-transformers>=1.1.2,<1.2.0"
]
```
If you're using one of the [trained pipelines](/models) we provide, you should
run [`spacy download`](/api/cli#download) to update to the latest version. To
see an overview of all installed packages and their compatibility, you can run
[`spacy validate`](/api/cli#validate).

View File

@ -10,7 +10,8 @@
{ "text": "Facts & Figures", "url": "/usage/facts-figures" },
{ "text": "spaCy 101", "url": "/usage/spacy-101" },
{ "text": "New in v3.0", "url": "/usage/v3" },
{ "text": "New in v3.1", "url": "/usage/v3-1" }
{ "text": "New in v3.1", "url": "/usage/v3-1" },
{ "text": "New in v3.2", "url": "/usage/v3-2" }
]
},
{

View File

@ -3343,6 +3343,65 @@
"category": ["research", "standalone", "scientific"],
"tags": ["Text Analytics", "Coherence", "Cohesion"]
},
{
"id": "lingfeat",
"title": "LingFeat",
"slogan": "A Linguistic Feature Extraction (Text Analysis) Tool for Readability Assessment and Text Simplification",
"description": "LingFeat is a feature extraction library which currently extracts 255 linguistic features from English string input. Categories include syntax, semantics, discourse, and also traditional readability formulas. Published in EMNLP 2021.",
"github": "brucewlee/lingfeat",
"pip": "lingfeat",
"code_example": [
"from lingfeat import extractor",
"",
"",
"text = 'TAEAN, South Chungcheong Province -- Just before sunup, Lee Young-ho, a seasoned fisherman with over 30 years of experience, silently waits for boats carrying blue crabs as the season for the seafood reaches its height. Soon afterward, small and big boats sail into Sinjin Port in Taean County, South Chungcheong Province, the second-largest source of blue crab after Incheon, accounting for 29 percent of total production of the country. A crane lifts 28 boxes filled with blue crabs weighing 40 kilograms each from the boat, worth about 10 million won ($8,500). “It has been a productive fall season for crabbing here. The water temperature is a very important factor affecting crab production. They hate cold water,” Lee said. The temperature of the sea off Taean appeared to have stayed at the level where crabs become active. If the sea temperature suddenly drops, crabs go into their winter dormancy mode, burrowing into the mud and sleeping through the cold months.'",
"",
"",
"#Pass text",
"LingFeat = extractor.pass_text(text)",
"",
"",
"#Preprocess text",
"LingFeat.preprocess()",
"",
"",
"#Extract features",
"#each method returns a dictionary of the corresponding features",
"#Advanced Semantic (AdSem) Features",
"WoKF = LingFeat.WoKF_() #Wikipedia Knowledge Features",
"WBKF = LingFeat.WBKF_() #WeeBit Corpus Knowledge Features",
"OSKF = LingFeat.OSKF_() #OneStopEng Corpus Knowledge Features",
"",
"#Discourse (Disco) Features",
"EnDF = LingFeat.EnDF_() #Entity Density Features",
"EnGF = LingFeat.EnGF_() #Entity Grid Features",
"",
"#Syntactic (Synta) Features",
"PhrF = LingFeat.PhrF_() #Noun/Verb/Adj/Adv/... Phrasal Features",
"TrSF = LingFeat.TrSF_() #(Parse) Tree Structural Features",
"POSF = LingFeat.POSF_() #Noun/Verb/Adj/Adv/... Part-of-Speech Features",
"",
"#Lexico Semantic (LxSem) Features",
"TTRF = LingFeat.TTRF_() #Type Token Ratio Features",
"VarF = LingFeat.VarF_() #Noun/Verb/Adj/Adv Variation Features",
"PsyF = LingFeat.PsyF_() #Psycholinguistic Difficulty of Words (AoA Kuperman)",
"WoLF = LingFeat.WorF_() #Word Familiarity from Frequency Count (SubtlexUS)",
"",
"Shallow Traditional (ShTra) Features",
"ShaF = LingFeat.ShaF_() #Shallow Features (e.g. avg number of tokens)",
"TraF = LingFeat.TraF_() #Traditional Formulas"
],
"code_language": "python",
"thumb": "https://raw.githubusercontent.com/brucewlee/lingfeat/master/img/lingfeat_logo2.png",
"image": "https://raw.githubusercontent.com/brucewlee/lingfeat/master/img/lingfeat_logo.png",
"author": "Bruce W. Lee (이웅성)",
"author_links": {
"github": "brucewlee",
"website": "https://brucewlee.github.io/"
},
"category": ["research", "scientific"],
"tags": ["Readability", "Simplification", "Feature Extraction", "Syntax", "Discourse", "Semantics", "Lexical"]
},
{
"id": "hmrb",
"title": "Hammurabi",
@ -3518,7 +3577,22 @@
},
"category": ["pipeline", "research", "standalone"],
"tags": ["spacy", "python", "nlp", "ner"]
}
},
{
"id": "WordDumb",
"title": "WordDumb",
"slogan": "A calibre plugin that generates Word Wise and X-Ray files.",
"description": "A calibre plugin that generates Word Wise and X-Ray files then sends them to Kindle. Supports KFX, AZW3 and MOBI eBooks. X-Ray supports 18 languages.",
"github": "xxyzz/WordDumb",
"code_language": "python",
"thumb": "https://raw.githubusercontent.com/xxyzz/WordDumb/master/starfish.svg",
"image": "https://user-images.githubusercontent.com/21101839/130245435-b874f19a-7785-4093-9975-81596efc42bb.png",
"author": "xxyzz",
"author_links": {
"github": "xxyzz"
},
"category": ["standalone"]
}
],
"categories": [

View File

@ -119,8 +119,8 @@ const AlertSpace = ({ nightly, legacy }) => {
}
const navAlert = (
<Link to="/usage/v3-1" hidden>
<strong>💥 Out now:</strong> spaCy v3.1
<Link to="/usage/v3-2" hidden>
<strong>💥 Out now:</strong> spaCy v3.2
</Link>
)