Merge pull request #10100 from svlandeg/feature/master_copy

Update develop with latest from master (2)
Sofie Van Landeghem 2022-01-20 14:29:50 +01:00 committed by GitHub
commit d2afdfefc2
31 changed files with 331 additions and 91 deletions

View File

@@ -108,8 +108,8 @@ apple =
     thinc-apple-ops>=0.0.4,<1.0.0
 # Language tokenizers with external dependencies
 ja =
-    sudachipy>=0.4.9
-    sudachidict_core>=20200330
+    sudachipy>=0.5.2,!=0.6.1
+    sudachidict_core>=20211220
 ko =
     natto-py==0.9.0
 th =
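For readers following along, these extras are what the Japanese tokenizer imports at runtime; a minimal check, assuming `spacy[ja]` (SudachiPy plus the core dictionary) is installed:

```python
import spacy

# spacy.lang.ja constructs a SudachiPy tokenizer on the fly, so this only
# works once the "ja" extra above has been installed.
nlp = spacy.blank("ja")
print([token.text for token in nlp("これはテストです。")])
```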

View File

@@ -1,3 +1,6 @@
+from .errors import Errors
+
+IOB_STRINGS = ("", "I", "O", "B")
 IDS = {
     "": NULL_ATTR,
@@ -64,7 +67,6 @@ IDS = {
     "FLAG61": FLAG61,
     "FLAG62": FLAG62,
     "FLAG63": FLAG63,
-
     "ID": ID,
     "ORTH": ORTH,
     "LOWER": LOWER,
@@ -72,7 +74,6 @@ IDS = {
     "SHAPE": SHAPE,
     "PREFIX": PREFIX,
     "SUFFIX": SUFFIX,
-
     "LENGTH": LENGTH,
     "LEMMA": LEMMA,
     "POS": POS,
@@ -87,7 +88,7 @@ IDS = {
     "SPACY": SPACY,
     "LANG": LANG,
     "MORPH": MORPH,
-    "IDX": IDX
+    "IDX": IDX,
 }
@@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     """
     inty_attrs = {}
     if _do_deprecated:
-        if 'F' in stringy_attrs:
+        if "F" in stringy_attrs:
             stringy_attrs["ORTH"] = stringy_attrs.pop("F")
-        if 'L' in stringy_attrs:
+        if "L" in stringy_attrs:
             stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
-        if 'pos' in stringy_attrs:
+        if "pos" in stringy_attrs:
             stringy_attrs["TAG"] = stringy_attrs.pop("pos")
-        if 'morph' in stringy_attrs:
-            morphs = stringy_attrs.pop('morph')
-        if 'number' in stringy_attrs:
-            stringy_attrs.pop('number')
-        if 'tenspect' in stringy_attrs:
-            stringy_attrs.pop('tenspect')
+        if "morph" in stringy_attrs:
+            morphs = stringy_attrs.pop("morph")
+        if "number" in stringy_attrs:
+            stringy_attrs.pop("number")
+        if "tenspect" in stringy_attrs:
+            stringy_attrs.pop("tenspect")
         morph_keys = [
-            'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
-            'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
-            'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
-            'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
-            'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
-            'NumValue', 'PartType', 'Polite', 'StyleVariant',
-            'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
-            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
-            'Polarity', 'PrepCase', 'Animacy'  # U20
+            "PunctType",
+            "PunctSide",
+            "Other",
+            "Degree",
+            "AdvType",
+            "Number",
+            "VerbForm",
+            "PronType",
+            "Aspect",
+            "Tense",
+            "PartType",
+            "Poss",
+            "Hyph",
+            "ConjType",
+            "NumType",
+            "Foreign",
+            "VerbType",
+            "NounType",
+            "Gender",
+            "Mood",
+            "Negative",
+            "Tense",
+            "Voice",
+            "Abbr",
+            "Derivation",
+            "Echo",
+            "Foreign",
+            "NameType",
+            "NounType",
+            "NumForm",
+            "NumValue",
+            "PartType",
+            "Polite",
+            "StyleVariant",
+            "PronType",
+            "AdjType",
+            "Person",
+            "Variant",
+            "AdpType",
+            "Reflex",
+            "Negative",
+            "Mood",
+            "Aspect",
+            "Case",
+            "Polarity",
+            "PrepCase",
+            "Animacy",  # U20
         ]
         for key in morph_keys:
             if key in stringy_attrs:
@@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     for name, value in stringy_attrs.items():
         int_key = intify_attr(name)
         if int_key is not None:
+            if int_key == ENT_IOB:
+                if value in IOB_STRINGS:
+                    value = IOB_STRINGS.index(value)
+                elif isinstance(value, str):
+                    raise ValueError(Errors.E1025.format(value=value))
             if strings_map is not None and isinstance(value, str):
-                if hasattr(strings_map, 'add'):
+                if hasattr(strings_map, "add"):
                     value = strings_map.add(value)
                 else:
                     value = strings_map[value]
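For context, the effect of the new `ENT_IOB` handling in `intify_attrs` is that IOB strings are mapped to their integer codes rather than being interned in the string store, and anything else raises the new `E1025` error; a minimal sketch mirroring the tests added further down:

```python
from spacy.attrs import ENT_IOB, intify_attrs

# IOB strings are converted to their index in IOB_STRINGS: ("", "I", "O", "B")
assert intify_attrs({"ENT_IOB": "B"}) == {ENT_IOB: 3}
assert intify_attrs({"ENT_IOB": ""}) == {ENT_IOB: 0}

# Any other string raises E1025 instead of being added as a symbol
try:
    intify_attrs({"ENT_IOB": "XX"})
except ValueError as err:
    print(err)  # E1025: Cannot intify the value 'XX' as an IOB string ...
```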

View File

@@ -14,7 +14,7 @@ from ..training.initialize import get_sourced_components
 from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..pipeline._parser_internals.nonproj import DELIMITER
-from ..pipeline import Morphologizer
+from ..pipeline import Morphologizer, SpanCategorizer
 from ..morphology import Morphology
 from ..language import Language
 from ..util import registry, resolve_dot_names
@@ -699,8 +699,34 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
     return count
-def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]:
-    if pipe_name not in nlp.pipe_names:
-        return set()
-    pipe = nlp.get_pipe(pipe_name)
-    return set(pipe.labels)
+def _get_labels_from_model(
+    nlp: Language, factory_name: str
+) -> Set[str]:
+    pipe_names = [
+        pipe_name
+        for pipe_name in nlp.pipe_names
+        if nlp.get_pipe_meta(pipe_name).factory == factory_name
+    ]
+    labels: Set[str] = set()
+    for pipe_name in pipe_names:
+        pipe = nlp.get_pipe(pipe_name)
+        labels.update(pipe.labels)
+    return labels
+def _get_labels_from_spancat(
+    nlp: Language
+) -> Dict[str, Set[str]]:
+    pipe_names = [
+        pipe_name
+        for pipe_name in nlp.pipe_names
+        if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+    ]
+    labels: Dict[str, Set[str]] = {}
+    for pipe_name in pipe_names:
+        pipe = nlp.get_pipe(pipe_name)
+        assert isinstance(pipe, SpanCategorizer)
+        if pipe.key not in labels:
+            labels[pipe.key] = set()
+        labels[pipe.key].update(pipe.labels)
+    return labels
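For context, the refactored helper now collects labels across all components created from a given factory (so renamed pipes are still found), and the spancat variant groups labels by the span key each component writes to. A rough usage sketch, noting that these are private helpers of the `debug data` command shown here only for illustration (it mirrors the slow test added to `test_cli` further down):

```python
from spacy.cli.debug_data import _get_labels_from_model, _get_labels_from_spancat
from spacy.lang.en import English

labels = ("A", "B")
nlp = English()
pipe = nlp.add_pipe("spancat", name="my_spancat")
for label in labels:
    pipe.add_label(label)
nlp.initialize()

# spancat labels come back keyed by the spans key the component writes to
assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
# other factories are pooled by factory name; no ner component is present here
assert _get_labels_from_model(nlp, "ner") == set()
```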

View File

@@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = {
     "LOC": "#ff9561",
     "PERSON": "#aa9cfc",
     "NORP": "#c887fb",
-    "FACILITY": "#9cc9cc",
+    "FAC": "#9cc9cc",
     "EVENT": "#ffeb80",
     "LAW": "#ff8197",
     "LANGUAGE": "#ff8197",

View File

@@ -888,9 +888,12 @@ class Errors(metaclass=ErrorsWithCodes):
     E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
              "Non-UD tags should use the `tag` property.")
     E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
-    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
-    E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.")
+    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
+             "exist.")
+    E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
+             "patterns.")
+    E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
+             "supported values are: 'I', 'O', 'B' and ''")
     # Deprecated model shortcuts, only used in errors and warnings

View File

@@ -1285,9 +1285,9 @@ class Language:
                 )
             except IOError:
                 raise IOError(Errors.E884.format(vectors=I["vectors"]))
-        if self.vocab.vectors.data.shape[1] >= 1:
+        if self.vocab.vectors.shape[1] >= 1:
             ops = get_current_ops()
-            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
+            self.vocab.vectors.to_ops(ops)
         if hasattr(self.tokenizer, "initialize"):
             tok_settings = validate_init_settings(
                 self.tokenizer.initialize,  # type: ignore[union-attr]
@@ -1332,8 +1332,8 @@ class Language:
        DOCS: https://spacy.io/api/language#resume_training
        """
        ops = get_current_ops()
-        if self.vocab.vectors.data.shape[1] >= 1:
-            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
+        if self.vocab.vectors.shape[1] >= 1:
+            self.vocab.vectors.to_ops(ops)
        for name, proc in self.pipeline:
            if hasattr(proc, "_rehearsal_model"):
                proc._rehearsal_model = deepcopy(proc.model)  # type: ignore[attr-defined]

View File

@@ -130,7 +130,9 @@ cdef class Lexeme:
             return 0.0
         vector = self.vector
         xp = get_array_module(vector)
-        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
+        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+        return result.item()
     @property
     def has_vector(self):

View File

@@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..tokens.morphanalysis cimport MorphAnalysis
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
+from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
 from ..schemas import validate_token_pattern
 from ..errors import Errors, MatchPatternError, Warnings
@@ -798,6 +798,9 @@ def _get_attr_values(spec, string_store):
             attr = "SENT_START"
         attr = IDS.get(attr)
         if isinstance(value, str):
-            value = string_store.add(value)
+            if attr == ENT_IOB and value in Token.iob_strings():
+                value = Token.iob_strings().index(value)
+            else:
+                value = string_store.add(value)
         elif isinstance(value, bool):
             value = int(value)
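For reference, this is what the new attribute enables at the pattern level: `ENT_IOB` values written as strings are translated to their integer codes before matching. A minimal sketch mirroring the matcher test added below:

```python
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Match tokens that are inside an entity but not at its start (IOB tag "I")
matcher.add("INSIDE_ENT", [[{"ENT_IOB": "I"}]])

doc = Doc(nlp.vocab, words=["I", "visited", "New", "York"])
doc.ents = [Span(doc, 2, 4, label="GPE")]
print([doc[start:end].text for _, start, end in matcher(doc)])  # ['York']
```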

View File

@@ -23,7 +23,7 @@ def create_pretrain_vectors(
     maxout_pieces: int, hidden_size: int, loss: str
 ) -> Callable[["Vocab", Model], Model]:
     def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
-        if vocab.vectors.data.shape[1] == 0:
+        if vocab.vectors.shape[1] == 0:
             raise ValueError(Errors.E875)
         model = build_cloze_multi_task_model(
             vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
@@ -116,7 +116,7 @@ def build_multi_task_model(
 def build_cloze_multi_task_model(
     vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
 ) -> Model:
-    nO = vocab.vectors.data.shape[1]
+    nO = vocab.vectors.shape[1]
     output_layer = chain(
         cast(Model[List["Floats2d"], Floats2d], list2array()),
         Maxout(

View File

@@ -94,7 +94,7 @@ def init(
     nM = model.get_dim("nM") if model.has_dim("nM") else None
     nO = model.get_dim("nO") if model.has_dim("nO") else None
     if X is not None and len(X):
-        nM = X[0].vocab.vectors.data.shape[1]
+        nM = X[0].vocab.vectors.shape[1]
     if Y is not None:
         nO = Y.data.shape[1]

View File

@@ -1,5 +1,6 @@
 from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
 from typing import Iterable, TypeVar, TYPE_CHECKING
+from .compat import Literal
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator, create_model
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
@@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
 UnderscoreValue = Union[
     TokenPatternString, TokenPatternNumber, str, int, float, list, bool
 ]
+IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3]
 class TokenPattern(BaseModel):
@@ -222,6 +224,7 @@ class TokenPattern(BaseModel):
     lemma: Optional[StringValue] = None
     shape: Optional[StringValue] = None
     ent_type: Optional[StringValue] = None
+    ent_iob: Optional[IobValue] = None
     ent_id: Optional[StringValue] = None
     ent_kb_id: Optional[StringValue] = None
     norm: Optional[StringValue] = None
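For context, the new `IobValue` type is what makes invalid IOB values fail schema validation up front instead of silently never matching. A small sketch using the same `validate_token_pattern` helper the matcher imports above (assuming it returns an empty list for valid patterns and a list of error messages otherwise):

```python
from spacy.schemas import validate_token_pattern

# Both the string and the integer encodings are accepted
assert validate_token_pattern([{"ENT_IOB": "B"}]) == []
assert validate_token_pattern([{"ENT_IOB": 3}]) == []

# Anything else is reported as a validation error (cf. the pattern test below)
errors = validate_token_pattern([{"ENT_IOB": "foo"}])
print(errors)  # non-empty list describing the invalid value
```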

View File

@@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
         "Merging the docs is fun.",
         "",
         "They don't think alike. ",
+        "",
         "Another doc.",
     ]
     en_texts_without_empty = [t for t in en_texts if len(t)]
@@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     en_docs = [en_tokenizer(text) for text in en_texts]
     en_docs[0].spans["group"] = [en_docs[0][1:4]]
     en_docs[2].spans["group"] = [en_docs[2][1:4]]
-    en_docs[3].spans["group"] = [en_docs[3][0:1]]
+    en_docs[4].spans["group"] = [en_docs[4][0:1]]
     span_group_texts = sorted(
-        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
+        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
     )
     de_doc = de_tokenizer(de_text)
     Token.set_extension("is_ambiguous", default=False)

View File

@@ -1,4 +1,5 @@
 import pytest
+from spacy.attrs import intify_attrs, ENT_IOB
 from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs
 from spacy.lang.en.stop_words import STOP_WORDS
@@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text):
     assert int_attrs == {ORTH: 10, IS_ALPHA: True}
+def test_attrs_ent_iob_intify():
+    int_attrs = intify_attrs({"ENT_IOB": ""})
+    assert int_attrs == {ENT_IOB: 0}
+    int_attrs = intify_attrs({"ENT_IOB": "I"})
+    assert int_attrs == {ENT_IOB: 1}
+    int_attrs = intify_attrs({"ENT_IOB": "O"})
+    assert int_attrs == {ENT_IOB: 2}
+    int_attrs = intify_attrs({"ENT_IOB": "B"})
+    assert int_attrs == {ENT_IOB: 3}
+    int_attrs = intify_attrs({ENT_IOB: ""})
+    assert int_attrs == {ENT_IOB: 0}
+    int_attrs = intify_attrs({ENT_IOB: "I"})
+    assert int_attrs == {ENT_IOB: 1}
+    int_attrs = intify_attrs({ENT_IOB: "O"})
+    assert int_attrs == {ENT_IOB: 2}
+    int_attrs = intify_attrs({ENT_IOB: "B"})
+    assert int_attrs == {ENT_IOB: 3}
+    with pytest.raises(ValueError):
+        int_attrs = intify_attrs({"ENT_IOB": "XX"})
+    with pytest.raises(ValueError):
+        int_attrs = intify_attrs({ENT_IOB: "XX"})
 @pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
 def test_lex_attrs_is_punct(text, match):
     assert is_punct(text) == match

View File

@@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab):
     matcher = Matcher(en_vocab)
     matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
     assert len(matcher(doc)) == 0
+def test_matcher_ent_iob_key(en_vocab):
+    """Test that patterns with ent_iob work correctly."""
+    matcher = Matcher(en_vocab)
+    matcher.add("Rule", [[{"ENT_IOB": "I"}]])
+    doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
+    doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
+    doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
+    doc2.ents = [Span(doc2, 4, 5, label="PERSON")]
+    matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)]
+    matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)]
+    assert len(matches1) == 1
+    assert matches1[0] == "York"
+    assert len(matches2) == 0
+    matcher = Matcher(en_vocab)  # Test iob pattern with operators
+    matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
+    doc = Doc(
+        en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
+    )
+    doc.ents = [Span(doc, 4, 7, label="PERSON")]
+    matches = [doc[start:end].text for _, start, end in matcher(doc)]
+    assert len(matches) == 3
+    assert matches[0] == "Maria"
+    assert matches[1] == "Maria Esperanza"
+    assert matches[2] == "Esperanza"

View File

@@ -12,6 +12,7 @@ TEST_PATTERNS = [
     ([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
     ([{"_": "foo"}], 1, 1),
     ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
+    ([{"ENT_IOB": "foo"}], 1, 1),
     ([1, 2, 3], 3, 1),
     # Bad patterns flagged outside of Matcher
     ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0),  # prev: (1, 0)

View File

@@ -79,7 +79,8 @@ def test_explicit_labels():
     nlp.initialize()
     assert spancat.labels == ("PERSON", "LOC")
+# TODO figure out why this is flaky
+@pytest.mark.skip(reason="Test is unreliable for unknown reason")
 def test_doc_gc():
     # If the Doc object is garbage collected, the spans won't be functional afterwards
     nlp = Language()
@@ -97,6 +98,7 @@ def test_doc_gc():
     assert isinstance(spangroups, SpanGroups)
     for key, spangroup in spangroups.items():
         assert isinstance(spangroup, SpanGroup)
+        # XXX This fails with length 0 sometimes
         assert len(spangroup) > 0
     with pytest.raises(RuntimeError):
         span = spangroup[0]

View File

@@ -12,6 +12,8 @@ from spacy.cli._util import is_subpath_of, load_project_config
 from spacy.cli._util import parse_config_overrides, string_to_list
 from spacy.cli._util import substitute_project_variables
 from spacy.cli._util import validate_project_commands
+from spacy.cli.debug_data import _get_labels_from_model
+from spacy.cli.debug_data import _get_labels_from_spancat
 from spacy.cli.download import get_compatibility, get_version
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
 from spacy.cli.package import get_third_party_dependencies
@@ -665,3 +667,28 @@ def test_get_third_party_dependencies():
     )
 def test_is_subpath_of(parent, child, expected):
     assert is_subpath_of(parent, child) == expected
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "factory_name,pipe_name",
+    [
+        ("ner", "ner"),
+        ("ner", "my_ner"),
+        ("spancat", "spancat"),
+        ("spancat", "my_spancat"),
+    ],
+)
+def test_get_labels_from_model(factory_name, pipe_name):
+    labels = ("A", "B")
+    nlp = English()
+    pipe = nlp.add_pipe(factory_name, name=pipe_name)
+    for label in labels:
+        pipe.add_label(label)
+    nlp.initialize()
+    assert nlp.get_pipe(pipe_name).labels == labels
+    if factory_name == "spancat":
+        assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
+    else:
+        assert _get_labels_from_model(nlp, factory_name) == set(labels)

View File

@@ -35,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors):
     assert lex1.vector_norm != 0
     assert lex2.vector_norm != 0
     assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1]
+    assert isinstance(lex1.similarity(lex2), float)
     assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2))
     assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
@@ -47,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors):
     assert doc[0].vector_norm != 0
     assert doc[1].vector_norm != 0
     assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1]
+    assert isinstance(doc[0].similarity(doc[1]), float)
     assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2))
     assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
+def test_vectors_similarity_SS(vocab, vectors):
+    [(word1, vec1), (word2, vec2)] = vectors
+    doc = Doc(vocab, words=[word1, word2])
+    assert isinstance(doc[0:1].similarity(doc[0:2]), float)
+    assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1])
+def test_vectors_similarity_DD(vocab, vectors):
+    [(word1, vec1), (word2, vec2)] = vectors
+    doc1 = Doc(vocab, words=[word1, word2])
+    doc2 = Doc(vocab, words=[word2, word1])
+    assert isinstance(doc1.similarity(doc2), float)
+    assert doc1.similarity(doc2) == doc2.similarity(doc1)
 def test_vectors_similarity_TD(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = Doc(vocab, words=[word1, word2])
     with pytest.warns(UserWarning):
+        assert isinstance(doc.similarity(doc[0]), float)
+        assert isinstance(doc[0].similarity(doc), float)
         assert doc.similarity(doc[0]) == doc[0].similarity(doc)
-def test_vectors_similarity_DS(vocab, vectors):
-    [(word1, vec1), (word2, vec2)] = vectors
-    doc = Doc(vocab, words=[word1, word2])
-    assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
 def test_vectors_similarity_TS(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = Doc(vocab, words=[word1, word2])
     with pytest.warns(UserWarning):
+        assert isinstance(doc[:2].similarity(doc[0]), float)
+        assert isinstance(doc[0].similarity(doc[-2]), float)
         assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
+def test_vectors_similarity_DS(vocab, vectors):
+    [(word1, vec1), (word2, vec2)] = vectors
+    doc = Doc(vocab, words=[word1, word2])
+    assert isinstance(doc.similarity(doc[:2]), float)
+    assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)

View File

@@ -421,7 +421,7 @@ def test_vector_is_oov():
 def test_init_vectors_unset():
     v = Vectors(shape=(10, 10))
     assert v.is_full is False
-    assert v.data.shape == (10, 10)
+    assert v.shape == (10, 10)
     with pytest.raises(ValueError):
         v = Vectors(shape=(10, 10), mode="floret")
@@ -514,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
     # rows: 2 rows per ngram
     rows = OPS.xp.asarray(
         [
-            h % nlp.vocab.vectors.data.shape[0]
+            h % nlp.vocab.vectors.shape[0]
             for ngram in ngrams
             for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
         ],
@@ -544,17 +544,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
     # an empty key returns 0s
     assert_equal(
         OPS.to_numpy(nlp.vocab[""].vector),
-        numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
+        numpy.zeros((nlp.vocab.vectors.shape[0],)),
     )
     # an empty batch returns 0s
     assert_equal(
         OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
-        numpy.zeros((1, nlp.vocab.vectors.data.shape[0])),
+        numpy.zeros((1, nlp.vocab.vectors.shape[0])),
     )
     # an empty key within a batch returns 0s
     assert_equal(
         OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
-        numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
+        numpy.zeros((nlp.vocab.vectors.shape[0],)),
     )
     # the loaded ngram vector table cannot be modified

View File

@@ -616,7 +616,7 @@ cdef class Doc:
        """
        if "has_vector" in self.user_hooks:
            return self.user_hooks["has_vector"](self)
-        elif self.vocab.vectors.data.size:
+        elif self.vocab.vectors.size:
            return True
        elif self.tensor.size:
            return True
@@ -641,7 +641,7 @@ cdef class Doc:
        if not len(self):
            self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
            return self._vector
-        elif self.vocab.vectors.data.size > 0:
+        elif self.vocab.vectors.size > 0:
            self._vector = sum(t.vector for t in self) / len(self)
            return self._vector
        elif self.tensor.size > 0:
@@ -1183,7 +1183,7 @@ cdef class Doc:
        token_offset = -1
        for doc in docs[:-1]:
            token_offset += len(doc)
-            if not (len(doc) > 0 and doc[-1].is_space):
+            if len(doc) > 0 and not doc[-1].is_space:
                concat_spaces[token_offset] = True
        concat_array = numpy.concatenate(arrays)

View File

@@ -352,7 +352,9 @@ cdef class Span:
            return 0.0
        vector = self.vector
        xp = get_array_module(vector)
-        return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+        return result.item()
    cpdef np.ndarray to_array(self, object py_attr_ids):
        """Given a list of M attribute IDs, export the tokens to a numpy
@@ -485,7 +487,7 @@ cdef class Span:
        """
        if "has_vector" in self.doc.user_span_hooks:
            return self.doc.user_span_hooks["has_vector"](self)
-        elif self.vocab.vectors.data.size > 0:
+        elif self.vocab.vectors.size > 0:
            return any(token.has_vector for token in self)
        elif self.doc.tensor.size > 0:
            return True

View File

@@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads
 from .. import parts_of_speech
 from ..errors import Errors, Warnings
+from ..attrs import IOB_STRINGS
 from .underscore import Underscore, get_ext_args
@@ -209,7 +210,9 @@ cdef class Token:
            return 0.0
        vector = self.vector
        xp = get_array_module(vector)
-        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
+        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+        return result.item()
    def has_morph(self):
        """Check whether the token has annotated morph information.
@@ -743,7 +746,7 @@ cdef class Token:
    @classmethod
    def iob_strings(cls):
-        return ("", "I", "O", "B")
+        return IOB_STRINGS
    @property
    def ent_iob_(self):
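As a quick illustration of the two changes in this file: the IOB labels are now sourced from the shared constant instead of a hard-coded tuple, and `similarity` goes through `.item()`, so it returns a plain Python float even when the vectors are cupy arrays on the GPU (the similarity tests above assert exactly that). A minimal check of the first point:

```python
from spacy.attrs import IOB_STRINGS
from spacy.tokens import Token

# Token.iob_strings() is now backed by the shared IOB_STRINGS constant,
# which intify_attrs and the matcher reuse for ENT_IOB handling.
assert Token.iob_strings() == IOB_STRINGS == ("", "I", "O", "B")
```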

View File

@@ -164,7 +164,7 @@ def load_vectors_into_model(
        len(vectors_nlp.vocab.vectors.keys()) == 0
        and vectors_nlp.vocab.vectors.mode != VectorsMode.floret
    ) or (
-        vectors_nlp.vocab.vectors.data.shape[0] == 0
+        vectors_nlp.vocab.vectors.shape[0] == 0
        and vectors_nlp.vocab.vectors.mode == VectorsMode.floret
    ):
        logger.warning(Warnings.W112.format(name=name))

View File

@@ -10,7 +10,7 @@ from typing import cast
 import warnings
 from enum import Enum
 import srsly
-from thinc.api import get_array_module, get_current_ops
+from thinc.api import Ops, get_array_module, get_current_ops
 from thinc.backends import get_array_ops
 from thinc.types import Floats2d
@@ -146,7 +146,7 @@ cdef class Vectors:
        DOCS: https://spacy.io/api/vectors#size
        """
-        return self.data.shape[0] * self.data.shape[1]
+        return self.data.size
    @property
    def is_full(self):
@@ -517,6 +517,9 @@ cdef class Vectors:
            for i in range(len(queries)) ], dtype="uint64")
        return (keys, best_rows, scores)
+    def to_ops(self, ops: Ops):
+        self.data = ops.asarray(self.data)
    def _get_cfg(self):
        if self.mode == Mode.default:
            return {

View File

@@ -283,7 +283,7 @@ cdef class Vocab:
    @property
    def vectors_length(self):
-        return self.vectors.data.shape[1]
+        return self.vectors.shape[1]
    def reset_vectors(self, *, width=None, shape=None):
        """Drop the current vector table. Because all vectors must be the same
@@ -294,7 +294,7 @@ cdef class Vocab:
        elif shape is not None:
            self.vectors = Vectors(strings=self.strings, shape=shape)
        else:
-            width = width if width is not None else self.vectors.data.shape[1]
+            width = width if width is not None else self.vectors.shape[1]
            self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))
    def prune_vectors(self, nr_row, batch_size=1024):

View File

@@ -99,9 +99,9 @@ be a token pattern (list) or a phrase pattern (string). For example:
 ## EntityRuler.initialize {#initialize tag="method" new="3"}
 Initialize the component with data and used before training to load in rules
-from a file. This method is typically called by
-[`Language.initialize`](/api/language#initialize) and lets you customize
-arguments it receives via the
+from a [pattern file](/usage/rule-based-matching/#entityruler-files). This method
+is typically called by [`Language.initialize`](/api/language#initialize) and
+lets you customize arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.

View File

@@ -44,6 +44,7 @@ rule-based matching are:
 | `SPACY` | Token has a trailing space. ~~bool~~ |
 | `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
 | `ENT_TYPE` | The token's entity label. ~~str~~ |
+| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ |
 | `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
 | `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ |
 | `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |

View File

@@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch.
 | ------ | --------------------------------------- |
 | `keys` | The keys. ~~Iterable[Union[int, str]]~~ |
+## Vectors.to_ops {#to_ops tag="method"}
+
+Change the embedding matrix to use different Thinc ops.
+
+> #### Example
+>
+> ```python
+> from thinc.api import NumpyOps
+>
+> vectors.to_ops(NumpyOps())
+> ```
+
+| Name  | Description                                               |
+| ----- | --------------------------------------------------------- |
+| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ |
+
 ## Vectors.to_disk {#to_disk tag="method"}
 Save the current state to a directory.

View File

@@ -1479,7 +1479,7 @@ especially useful if you want to pass in a string instead of calling
 ### Example: Pipeline component for GPE entities and country meta data via a REST API {#component-example3}
 This example shows the implementation of a pipeline component that fetches
-country meta data via the [REST Countries API](https://restcountries.eu), sets
+country meta data via the [REST Countries API](https://restcountries.com), sets
 entity annotations for countries and sets custom attributes on the `Doc` and
 `Span`, for example, the capital, latitude/longitude coordinates and even the
 country flag.
@@ -1495,7 +1495,7 @@ from spacy.tokens import Doc, Span, Token
 @Language.factory("rest_countries")
 class RESTCountriesComponent:
     def __init__(self, nlp, name, label="GPE"):
-        r = requests.get("https://restcountries.eu/rest/v2/all")
+        r = requests.get("https://restcountries.com/v2/all")
         r.raise_for_status()  # make sure requests raises an error if it fails
         countries = r.json()
         # Convert API response to dict keyed by country name for easy lookup

View File

@@ -1770,9 +1770,9 @@
     "title": "Applied Language Technology",
     "slogan": "NLP for newcomers using spaCy and Stanza",
     "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. The learning materials assume no previous knowledge of the Python programming language.",
-    "url": "https://applied-language-technology.readthedocs.io/",
+    "url": "https://applied-language-technology.mooc.fi",
     "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg",
-    "thumb": "https://applied-language-technology.readthedocs.io/en/latest/_static/logo.png",
+    "thumb": "https://www.mv.helsinki.fi/home/thiippal/images/applt-logo.png",
     "author": "Tuomo Hiippala",
     "author_links": {
         "twitter": "tuomo_h",

View File

@@ -113,8 +113,7 @@ const QuickstartInstall = ({ id, title }) => {
        {
            id: 'venv',
            title: 'virtual env',
-            help:
-                'Use a virtual environment and install spaCy into a user directory',
+            help: 'Use a virtual environment',
        },
        {
            id: 'train',
@@ -165,27 +164,51 @@ const QuickstartInstall = ({ id, title }) => {
            setters={setters}
            showDropdown={showDropdown}
        >
-            <QS config="venv">python -m venv .env</QS>
-            <QS config="venv" os="mac">
+            <QS package="pip" config="venv">
+                python -m venv .env
+            </QS>
+            <QS package="pip" config="venv" os="mac">
                source .env/bin/activate
            </QS>
-            <QS config="venv" os="linux">
+            <QS package="pip" config="venv" os="linux">
                source .env/bin/activate
            </QS>
-            <QS config="venv" os="windows">
+            <QS package="pip" config="venv" os="windows">
                .env\Scripts\activate
            </QS>
+            <QS package="source" config="venv">
+                python -m venv .env
+            </QS>
+            <QS package="source" config="venv" os="mac">
+                source .env/bin/activate
+            </QS>
+            <QS package="source" config="venv" os="linux">
+                source .env/bin/activate
+            </QS>
+            <QS package="source" config="venv" os="windows">
+                .env\Scripts\activate
+            </QS>
+            <QS package="conda" config="venv">
+                conda create -n venv
+            </QS>
+            <QS package="conda" config="venv">
+                conda activate venv
+            </QS>
            <QS package="pip">pip install -U pip setuptools wheel</QS>
            <QS package="source">pip install -U pip setuptools wheel</QS>
            <QS package="pip">
-                pip install -U {pkg}
-                {pipExtras && `[${pipExtras}]`}
+                {pipExtras
+                    ? `pip install -U '${pkg}[${pipExtras}]'`
+                    : `pip install -U ${pkg}`}
                {nightly ? ' --pre' : ''}
            </QS>
            <QS package="conda">conda install -c conda-forge spacy</QS>
            <QS package="conda" hardware="gpu">
                conda install -c conda-forge cupy
            </QS>
+            <QS package="conda" config="train">
+                conda install -c conda-forge spacy-transformers
+            </QS>
            <QS package="source">
                git clone https://github.com/{repo}
                {nightly ? ` --branch ${DEFAULT_BRANCH}` : ''}
@@ -205,9 +228,6 @@ const QuickstartInstall = ({ id, title }) => {
            <QS config="train" package="conda" comment prompt={false}>
                # packages only available via pip
            </QS>
-            <QS config="train" package="conda">
-                pip install spacy-transformers
-            </QS>
            <QS config="train" package="conda">
                pip install spacy-lookups-data
            </QS>