Merge pull request #10100 from svlandeg/feature/master_copy

Update develop with latest from master (2)
Commit d2afdfefc2 by Sofie Van Landeghem, 2022-01-20 14:29:50 +01:00, committed by GitHub (GPG Key ID: 4AEE18F83AFDEB23)
31 changed files with 331 additions and 91 deletions

View File

@ -108,8 +108,8 @@ apple =
thinc-apple-ops>=0.0.4,<1.0.0
# Language tokenizers with external dependencies
ja =
sudachipy>=0.4.9
sudachidict_core>=20200330
sudachipy>=0.5.2,!=0.6.1
sudachidict_core>=20211220
ko =
natto-py==0.9.0
th =

View File

@ -1,3 +1,6 @@
from .errors import Errors
IOB_STRINGS = ("", "I", "O", "B")
IDS = {
"": NULL_ATTR,
@ -64,7 +67,6 @@ IDS = {
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
@ -72,7 +74,6 @@ IDS = {
"SHAPE": SHAPE,
"PREFIX": PREFIX,
"SUFFIX": SUFFIX,
"LENGTH": LENGTH,
"LEMMA": LEMMA,
"POS": POS,
@ -87,7 +88,7 @@ IDS = {
"SPACY": SPACY,
"LANG": LANG,
"MORPH": MORPH,
"IDX": IDX
"IDX": IDX,
}
@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
"""
inty_attrs = {}
if _do_deprecated:
if 'F' in stringy_attrs:
if "F" in stringy_attrs:
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
if 'L' in stringy_attrs:
if "L" in stringy_attrs:
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
if 'pos' in stringy_attrs:
if "pos" in stringy_attrs:
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
if 'morph' in stringy_attrs:
morphs = stringy_attrs.pop('morph')
if 'number' in stringy_attrs:
stringy_attrs.pop('number')
if 'tenspect' in stringy_attrs:
stringy_attrs.pop('tenspect')
if "morph" in stringy_attrs:
morphs = stringy_attrs.pop("morph")
if "number" in stringy_attrs:
stringy_attrs.pop("number")
if "tenspect" in stringy_attrs:
stringy_attrs.pop("tenspect")
morph_keys = [
'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
'NumValue', 'PartType', 'Polite', 'StyleVariant',
'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
'Polarity', 'PrepCase', 'Animacy' # U20
"PunctType",
"PunctSide",
"Other",
"Degree",
"AdvType",
"Number",
"VerbForm",
"PronType",
"Aspect",
"Tense",
"PartType",
"Poss",
"Hyph",
"ConjType",
"NumType",
"Foreign",
"VerbType",
"NounType",
"Gender",
"Mood",
"Negative",
"Tense",
"Voice",
"Abbr",
"Derivation",
"Echo",
"Foreign",
"NameType",
"NounType",
"NumForm",
"NumValue",
"PartType",
"Polite",
"StyleVariant",
"PronType",
"AdjType",
"Person",
"Variant",
"AdpType",
"Reflex",
"Negative",
"Mood",
"Aspect",
"Case",
"Polarity",
"PrepCase",
"Animacy", # U20
]
for key in morph_keys:
if key in stringy_attrs:
@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
for name, value in stringy_attrs.items():
int_key = intify_attr(name)
if int_key is not None:
if int_key == ENT_IOB:
if value in IOB_STRINGS:
value = IOB_STRINGS.index(value)
elif isinstance(value, str):
raise ValueError(Errors.E1025.format(value=value))
if strings_map is not None and isinstance(value, str):
if hasattr(strings_map, 'add'):
if hasattr(strings_map, "add"):
value = strings_map.add(value)
else:
value = strings_map[value]
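The net effect of the new ENT_IOB handling in `intify_attrs`, mirroring the test added further down in this PR (a sketch):

```python
from spacy.attrs import ENT_IOB, intify_attrs

# IOB strings are converted to their index in IOB_STRINGS ("", "I", "O", "B")
assert intify_attrs({"ENT_IOB": "B"}) == {ENT_IOB: 3}
# Any other string raises a ValueError with the new E1025 message
try:
    intify_attrs({"ENT_IOB": "XX"})
except ValueError:
    pass
```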

View File

@ -14,7 +14,7 @@ from ..training.initialize import get_sourced_components
from ..schemas import ConfigSchemaTraining
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
from ..pipeline import Morphologizer
from ..pipeline import Morphologizer, SpanCategorizer
from ..morphology import Morphology
from ..language import Language
from ..util import registry, resolve_dot_names
@ -699,8 +699,34 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
return count
def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]:
if pipe_name not in nlp.pipe_names:
return set()
def _get_labels_from_model(
nlp: Language, factory_name: str
) -> Set[str]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
if nlp.get_pipe_meta(pipe_name).factory == factory_name
]
labels: Set[str] = set()
for pipe_name in pipe_names:
pipe = nlp.get_pipe(pipe_name)
return set(pipe.labels)
labels.update(pipe.labels)
return labels
def _get_labels_from_spancat(
nlp: Language
) -> Dict[str, Set[str]]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
if nlp.get_pipe_meta(pipe_name).factory == "spancat"
]
labels: Dict[str, Set[str]] = {}
for pipe_name in pipe_names:
pipe = nlp.get_pipe(pipe_name)
assert isinstance(pipe, SpanCategorizer)
if pipe.key not in labels:
labels[pipe.key] = set()
labels[pipe.key].update(pipe.labels)
return labels
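A sketch of how the refactored helpers are exercised (this mirrors the `test_get_labels_from_model` test added below): lookup is keyed by factory name and labels are aggregated over all matching pipes.

```python
from spacy.lang.en import English
from spacy.cli.debug_data import _get_labels_from_model, _get_labels_from_spancat

nlp = English()
ner = nlp.add_pipe("ner", name="my_ner")  # pipe name may differ from the factory name
ner.add_label("PERSON")
nlp.initialize()
assert _get_labels_from_model(nlp, "ner") == {"PERSON"}
# _get_labels_from_spancat(nlp) returns a dict keyed by each spancat's span key
```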

View File

@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = {
"LOC": "#ff9561",
"PERSON": "#aa9cfc",
"NORP": "#c887fb",
"FACILITY": "#9cc9cc",
"FAC": "#9cc9cc",
"EVENT": "#ffeb80",
"LAW": "#ff8197",
"LANGUAGE": "#ff8197",

View File

@ -888,9 +888,12 @@ class Errors(metaclass=ErrorsWithCodes):
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
"Non-UD tags should use the `tag` property.")
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.")
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
"exist.")
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
"patterns.")
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
"supported values are: 'I', 'O', 'B' and ''")
# Deprecated model shortcuts, only used in errors and warnings

View File

@ -1285,9 +1285,9 @@ class Language:
)
except IOError:
raise IOError(Errors.E884.format(vectors=I["vectors"]))
if self.vocab.vectors.data.shape[1] >= 1:
if self.vocab.vectors.shape[1] >= 1:
ops = get_current_ops()
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
self.vocab.vectors.to_ops(ops)
if hasattr(self.tokenizer, "initialize"):
tok_settings = validate_init_settings(
self.tokenizer.initialize, # type: ignore[union-attr]
@ -1332,8 +1332,8 @@ class Language:
DOCS: https://spacy.io/api/language#resume_training
"""
ops = get_current_ops()
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
if self.vocab.vectors.shape[1] >= 1:
self.vocab.vectors.to_ops(ops)
for name, proc in self.pipeline:
if hasattr(proc, "_rehearsal_model"):
proc._rehearsal_model = deepcopy(proc.model) # type: ignore[attr-defined]
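The replacement idiom used in both hunks, in isolation (a sketch; `Vectors.to_ops` is added later in this PR):

```python
import spacy
from thinc.api import get_current_ops

nlp = spacy.blank("en")
ops = get_current_ops()
# replaces the old pattern: nlp.vocab.vectors.data = ops.asarray(nlp.vocab.vectors.data)
nlp.vocab.vectors.to_ops(ops)
```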

View File

@ -130,7 +130,9 @@ cdef class Lexeme:
return 0.0
vector = self.vector
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
@property
def has_vector(self):

View File

@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.span cimport Span
from ..tokens.token cimport Token
from ..tokens.morphanalysis cimport MorphAnalysis
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings
@ -798,6 +798,9 @@ def _get_attr_values(spec, string_store):
attr = "SENT_START"
attr = IDS.get(attr)
if isinstance(value, str):
if attr == ENT_IOB and value in Token.iob_strings():
value = Token.iob_strings().index(value)
else:
value = string_store.add(value)
elif isinstance(value, bool):
value = int(value)

View File

@ -23,7 +23,7 @@ def create_pretrain_vectors(
maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
if vocab.vectors.data.shape[1] == 0:
if vocab.vectors.shape[1] == 0:
raise ValueError(Errors.E875)
model = build_cloze_multi_task_model(
vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
@ -116,7 +116,7 @@ def build_multi_task_model(
def build_cloze_multi_task_model(
vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
) -> Model:
nO = vocab.vectors.data.shape[1]
nO = vocab.vectors.shape[1]
output_layer = chain(
cast(Model[List["Floats2d"], Floats2d], list2array()),
Maxout(

View File

@ -94,7 +94,7 @@ def init(
nM = model.get_dim("nM") if model.has_dim("nM") else None
nO = model.get_dim("nO") if model.has_dim("nO") else None
if X is not None and len(X):
nM = X[0].vocab.vectors.data.shape[1]
nM = X[0].vocab.vectors.shape[1]
if Y is not None:
nO = Y.data.shape[1]
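All of these call sites apply the same substitution: read dimensions via `Vectors.shape` (and `Vectors.size`) instead of reaching into `vectors.data.shape`. A minimal sketch:

```python
from spacy.vectors import Vectors

v = Vectors(shape=(10, 64))
assert v.shape == (10, 64)  # dimensions without touching v.data
assert v.size == 10 * 64    # the size property now returns self.data.size
```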

View File

@ -1,5 +1,6 @@
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
from typing import Iterable, TypeVar, TYPE_CHECKING
from .compat import Literal
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
UnderscoreValue = Union[
TokenPatternString, TokenPatternNumber, str, int, float, list, bool
]
IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3]
class TokenPattern(BaseModel):
@ -222,6 +224,7 @@ class TokenPattern(BaseModel):
lemma: Optional[StringValue] = None
shape: Optional[StringValue] = None
ent_type: Optional[StringValue] = None
ent_iob: Optional[IobValue] = None
ent_id: Optional[StringValue] = None
ent_kb_id: Optional[StringValue] = None
norm: Optional[StringValue] = None
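What the new `IobValue` field buys at validation time, mirroring the `TEST_PATTERNS` entry added further down (a sketch):

```python
from spacy.schemas import validate_token_pattern

assert validate_token_pattern([{"ENT_IOB": "I"}]) == []    # valid: no error messages
assert validate_token_pattern([{"ENT_IOB": "foo"}]) != []  # invalid value is reported
```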

View File

@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
"Merging the docs is fun.",
"",
"They don't think alike. ",
"",
"Another doc.",
]
en_texts_without_empty = [t for t in en_texts if len(t)]
@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_docs = [en_tokenizer(text) for text in en_texts]
en_docs[0].spans["group"] = [en_docs[0][1:4]]
en_docs[2].spans["group"] = [en_docs[2][1:4]]
en_docs[3].spans["group"] = [en_docs[3][0:1]]
en_docs[4].spans["group"] = [en_docs[4][0:1]]
span_group_texts = sorted(
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
)
de_doc = de_tokenizer(de_text)
Token.set_extension("is_ambiguous", default=False)

View File

@ -1,4 +1,5 @@
import pytest
from spacy.attrs import intify_attrs, ENT_IOB
from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs
from spacy.lang.en.stop_words import STOP_WORDS
@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text):
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
def test_attrs_ent_iob_intify():
int_attrs = intify_attrs({"ENT_IOB": ""})
assert int_attrs == {ENT_IOB: 0}
int_attrs = intify_attrs({"ENT_IOB": "I"})
assert int_attrs == {ENT_IOB: 1}
int_attrs = intify_attrs({"ENT_IOB": "O"})
assert int_attrs == {ENT_IOB: 2}
int_attrs = intify_attrs({"ENT_IOB": "B"})
assert int_attrs == {ENT_IOB: 3}
int_attrs = intify_attrs({ENT_IOB: ""})
assert int_attrs == {ENT_IOB: 0}
int_attrs = intify_attrs({ENT_IOB: "I"})
assert int_attrs == {ENT_IOB: 1}
int_attrs = intify_attrs({ENT_IOB: "O"})
assert int_attrs == {ENT_IOB: 2}
int_attrs = intify_attrs({ENT_IOB: "B"})
assert int_attrs == {ENT_IOB: 3}
with pytest.raises(ValueError):
int_attrs = intify_attrs({"ENT_IOB": "XX"})
with pytest.raises(ValueError):
int_attrs = intify_attrs({ENT_IOB: "XX"})
@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
def test_lex_attrs_is_punct(text, match):
assert is_punct(text) == match

View File

@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab):
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
assert len(matcher(doc)) == 0
def test_matcher_ent_iob_key(en_vocab):
"""Test that patterns with ent_iob works correctly."""
matcher = Matcher(en_vocab)
matcher.add("Rule", [[{"ENT_IOB": "I"}]])
doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
doc2.ents = [Span(doc2, 4, 5, label="PERSON")]
matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)]
matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)]
assert len(matches1) == 1
assert matches1[0] == "York"
assert len(matches2) == 0
matcher = Matcher(en_vocab) # Test iob pattern with operators
matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
doc = Doc(
en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
)
doc.ents = [Span(doc, 4, 7, label="PERSON")]
matches = [doc[start:end].text for _, start, end in matcher(doc)]
assert len(matches) == 3
assert matches[0] == "Maria"
assert matches[1] == "Maria Esperanza"
assert matches[2] == "Esperanza"

View File

@ -12,6 +12,7 @@ TEST_PATTERNS = [
([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
([{"_": "foo"}], 1, 1),
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
([{"ENT_IOB": "foo"}], 1, 1),
([1, 2, 3], 3, 1),
# Bad patterns flagged outside of Matcher
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)

View File

@ -79,7 +79,8 @@ def test_explicit_labels():
nlp.initialize()
assert spancat.labels == ("PERSON", "LOC")
#TODO figure out why this is flaky
@pytest.mark.skip(reason="Test is unreliable for unknown reason")
def test_doc_gc():
# If the Doc object is garbage collected, the spans won't be functional afterwards
nlp = Language()
@ -97,6 +98,7 @@ def test_doc_gc():
assert isinstance(spangroups, SpanGroups)
for key, spangroup in spangroups.items():
assert isinstance(spangroup, SpanGroup)
# XXX This fails with length 0 sometimes
assert len(spangroup) > 0
with pytest.raises(RuntimeError):
span = spangroup[0]

View File

@ -12,6 +12,8 @@ from spacy.cli._util import is_subpath_of, load_project_config
from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
from spacy.cli.debug_data import _get_labels_from_model
from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.package import get_third_party_dependencies
@ -665,3 +667,28 @@ def test_get_third_party_dependencies():
)
def test_is_subpath_of(parent, child, expected):
assert is_subpath_of(parent, child) == expected
@pytest.mark.slow
@pytest.mark.parametrize(
"factory_name,pipe_name",
[
("ner", "ner"),
("ner", "my_ner"),
("spancat", "spancat"),
("spancat", "my_spancat"),
],
)
def test_get_labels_from_model(factory_name, pipe_name):
labels = ("A", "B")
nlp = English()
pipe = nlp.add_pipe(factory_name, name=pipe_name)
for label in labels:
pipe.add_label(label)
nlp.initialize()
assert nlp.get_pipe(pipe_name).labels == labels
if factory_name == "spancat":
assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
else:
assert _get_labels_from_model(nlp, factory_name) == set(labels)

View File

@ -35,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors):
assert lex1.vector_norm != 0
assert lex2.vector_norm != 0
assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1]
assert isinstance(lex1.similarity(lex2), float)
assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2))
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
@ -47,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors):
assert doc[0].vector_norm != 0
assert doc[1].vector_norm != 0
assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1]
assert isinstance(doc[0].similarity(doc[1]), float)
assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2))
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
def test_vectors_similarity_SS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert isinstance(doc[0:1].similarity(doc[0:2]), float)
assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1])
def test_vectors_similarity_DD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc1 = Doc(vocab, words=[word1, word2])
doc2 = Doc(vocab, words=[word2, word1])
assert isinstance(doc1.similarity(doc2), float)
assert doc1.similarity(doc2) == doc2.similarity(doc1)
def test_vectors_similarity_TD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning):
assert isinstance(doc.similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc), float)
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
def test_vectors_similarity_DS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
def test_vectors_similarity_TS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning):
assert isinstance(doc[:2].similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc[-2]), float)
assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
def test_vectors_similarity_DS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert isinstance(doc.similarity(doc[:2]), float)
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)

View File

@ -421,7 +421,7 @@ def test_vector_is_oov():
def test_init_vectors_unset():
v = Vectors(shape=(10, 10))
assert v.is_full is False
assert v.data.shape == (10, 10)
assert v.shape == (10, 10)
with pytest.raises(ValueError):
v = Vectors(shape=(10, 10), mode="floret")
@ -514,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
# rows: 2 rows per ngram
rows = OPS.xp.asarray(
[
h % nlp.vocab.vectors.data.shape[0]
h % nlp.vocab.vectors.shape[0]
for ngram in ngrams
for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
],
@ -544,17 +544,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
# an empty key returns 0s
assert_equal(
OPS.to_numpy(nlp.vocab[""].vector),
numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
numpy.zeros((nlp.vocab.vectors.shape[0],)),
)
# an empty batch returns 0s
assert_equal(
OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
numpy.zeros((1, nlp.vocab.vectors.data.shape[0])),
numpy.zeros((1, nlp.vocab.vectors.shape[0])),
)
# an empty key within a batch returns 0s
assert_equal(
OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
numpy.zeros((nlp.vocab.vectors.shape[0],)),
)
# the loaded ngram vector table cannot be modified

View File

@ -616,7 +616,7 @@ cdef class Doc:
"""
if "has_vector" in self.user_hooks:
return self.user_hooks["has_vector"](self)
elif self.vocab.vectors.data.size:
elif self.vocab.vectors.size:
return True
elif self.tensor.size:
return True
@ -641,7 +641,7 @@ cdef class Doc:
if not len(self):
self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
return self._vector
elif self.vocab.vectors.data.size > 0:
elif self.vocab.vectors.size > 0:
self._vector = sum(t.vector for t in self) / len(self)
return self._vector
elif self.tensor.size > 0:
@ -1183,7 +1183,7 @@ cdef class Doc:
token_offset = -1
for doc in docs[:-1]:
token_offset += len(doc)
if not (len(doc) > 0 and doc[-1].is_space):
if len(doc) > 0 and not doc[-1].is_space:
concat_spaces[token_offset] = True
concat_array = numpy.concatenate(arrays)
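With the flipped condition, empty docs are simply skipped when deciding where to insert joining whitespace, instead of forcing a trailing space after the preceding doc (cf. the updated `test_doc_api_from_docs` above). A sketch of the call:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
docs = [nlp("Merging the docs is fun."), nlp(""), nlp("Another doc.")]
merged = Doc.from_docs(docs)
assert len(merged) == len(docs[0]) + len(docs[2])  # empty docs contribute no tokens
```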

View File

@ -352,7 +352,9 @@ cdef class Span:
return 0.0
vector = self.vector
xp = get_array_module(vector)
return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy
@ -485,7 +487,7 @@ cdef class Span:
"""
if "has_vector" in self.doc.user_span_hooks:
return self.doc.user_span_hooks["has_vector"](self)
elif self.vocab.vectors.data.size > 0:
elif self.vocab.vectors.size > 0:
return any(token.has_vector for token in self)
elif self.doc.tensor.size > 0:
return True
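The same `.item()` scalar fix is applied to `Lexeme.similarity` above and `Token.similarity` below; its visible effect, assuming a pipeline with word vectors such as `en_core_web_md` (a sketch):

```python
import spacy

nlp = spacy.load("en_core_web_md")  # any pipeline with word vectors
doc = nlp("apple banana")
assert isinstance(doc[0].similarity(doc[1]), float)  # a plain Python float on numpy and cupy alike
```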

View File

@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads
from .. import parts_of_speech
from ..errors import Errors, Warnings
from ..attrs import IOB_STRINGS
from .underscore import Underscore, get_ext_args
@ -209,7 +210,9 @@ cdef class Token:
return 0.0
vector = self.vector
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
def has_morph(self):
"""Check whether the token has annotated morph information.
@ -743,7 +746,7 @@ cdef class Token:
@classmethod
def iob_strings(cls):
return ("", "I", "O", "B")
return IOB_STRINGS
@property
def ent_iob_(self):

View File

@ -164,7 +164,7 @@ def load_vectors_into_model(
len(vectors_nlp.vocab.vectors.keys()) == 0
and vectors_nlp.vocab.vectors.mode != VectorsMode.floret
) or (
vectors_nlp.vocab.vectors.data.shape[0] == 0
vectors_nlp.vocab.vectors.shape[0] == 0
and vectors_nlp.vocab.vectors.mode == VectorsMode.floret
):
logger.warning(Warnings.W112.format(name=name))

View File

@ -10,7 +10,7 @@ from typing import cast
import warnings
from enum import Enum
import srsly
from thinc.api import get_array_module, get_current_ops
from thinc.api import Ops, get_array_module, get_current_ops
from thinc.backends import get_array_ops
from thinc.types import Floats2d
@ -146,7 +146,7 @@ cdef class Vectors:
DOCS: https://spacy.io/api/vectors#size
"""
return self.data.shape[0] * self.data.shape[1]
return self.data.size
@property
def is_full(self):
@ -517,6 +517,9 @@ cdef class Vectors:
for i in range(len(queries)) ], dtype="uint64")
return (keys, best_rows, scores)
def to_ops(self, ops: Ops):
self.data = ops.asarray(self.data)
def _get_cfg(self):
if self.mode == Mode.default:
return {

View File

@ -283,7 +283,7 @@ cdef class Vocab:
@property
def vectors_length(self):
return self.vectors.data.shape[1]
return self.vectors.shape[1]
def reset_vectors(self, *, width=None, shape=None):
"""Drop the current vector table. Because all vectors must be the same
@ -294,7 +294,7 @@ cdef class Vocab:
elif shape is not None:
self.vectors = Vectors(strings=self.strings, shape=shape)
else:
width = width if width is not None else self.vectors.data.shape[1]
width = width if width is not None else self.vectors.shape[1]
self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))
def prune_vectors(self, nr_row, batch_size=1024):

View File

@ -99,9 +99,9 @@ be a token pattern (list) or a phrase pattern (string). For example:
## EntityRuler.initialize {#initialize tag="method" new="3"}
Initialize the component with data and used before training to load in rules
from a file. This method is typically called by
[`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
from a [pattern file](/usage/rule-based-matching/#entityruler-files). This method
is typically called by [`Language.initialize`](/api/language#initialize) and
lets you customize arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
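For reference, a minimal sketch of initializing the ruler with patterns in code; the docs change above points to loading the same data from a pattern file via the `[initialize.components]` block instead:

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "spaCy"}]  # would normally be read from a pattern file
ruler.initialize(lambda: [], patterns=patterns)
```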

View File

@ -44,6 +44,7 @@ rule-based matching are:
| `SPACY` | Token has a trailing space. ~~bool~~ |
|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ |
| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ |
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
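A short pattern using the new `ENT_IOB` attribute (this mirrors the matcher test added in this PR):

```python
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("INSIDE_ENT", [[{"ENT_IOB": "I"}]])  # tokens inside (but not starting) an entity

doc = nlp("I visited New York")
doc.ents = [Span(doc, 2, 4, label="GPE")]
assert [doc[s:e].text for _, s, e in matcher(doc)] == ["York"]
```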

View File

@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch.
| ------ | --------------------------------------- |
| `keys` | The keys. ~~Iterable[Union[int, str]]~~ |
## Vectors.to_ops {#to_ops tag="method"}
Change the embedding matrix to use different Thinc ops.
> #### Example
>
> ```python
> from thinc.api import NumpyOps
>
> vectors.to_ops(NumpyOps())
>
> ```
| Name | Description |
|-------|----------------------------------------------------------|
| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ |
## Vectors.to_disk {#to_disk tag="method"}
Save the current state to a directory.

View File

@ -1479,7 +1479,7 @@ especially useful if you want to pass in a string instead of calling
### Example: Pipeline component for GPE entities and country meta data via a REST API {#component-example3}
This example shows the implementation of a pipeline component that fetches
country meta data via the [REST Countries API](https://restcountries.eu), sets
country meta data via the [REST Countries API](https://restcountries.com), sets
entity annotations for countries and sets custom attributes on the `Doc` and
`Span`, for example, the capital, latitude/longitude coordinates and even the
country flag.
@ -1495,7 +1495,7 @@ from spacy.tokens import Doc, Span, Token
@Language.factory("rest_countries")
class RESTCountriesComponent:
def __init__(self, nlp, name, label="GPE"):
r = requests.get("https://restcountries.eu/rest/v2/all")
r = requests.get("https://restcountries.com/v2/all")
r.raise_for_status() # make sure requests raises an error if it fails
countries = r.json()
# Convert API response to dict keyed by country name for easy lookup

View File

@ -1770,9 +1770,9 @@
"title": "Applied Language Technology",
"slogan": "NLP for newcomers using spaCy and Stanza",
"description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. The learning materials assume no previous knowledge of the Python programming language.",
"url": "https://applied-language-technology.readthedocs.io/",
"url": "https://applied-language-technology.mooc.fi",
"image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg",
"thumb": "https://applied-language-technology.readthedocs.io/en/latest/_static/logo.png",
"thumb": "https://www.mv.helsinki.fi/home/thiippal/images/applt-logo.png",
"author": "Tuomo Hiippala",
"author_links": {
"twitter": "tuomo_h",

View File

@ -113,8 +113,7 @@ const QuickstartInstall = ({ id, title }) => {
{
id: 'venv',
title: 'virtual env',
help:
'Use a virtual environment and install spaCy into a user directory',
help: 'Use a virtual environment',
},
{
id: 'train',
@ -165,27 +164,51 @@ const QuickstartInstall = ({ id, title }) => {
setters={setters}
showDropdown={showDropdown}
>
<QS config="venv">python -m venv .env</QS>
<QS config="venv" os="mac">
<QS package="pip" config="venv">
python -m venv .env
</QS>
<QS package="pip" config="venv" os="mac">
source .env/bin/activate
</QS>
<QS config="venv" os="linux">
<QS package="pip" config="venv" os="linux">
source .env/bin/activate
</QS>
<QS config="venv" os="windows">
<QS package="pip" config="venv" os="windows">
.env\Scripts\activate
</QS>
<QS package="source" config="venv">
python -m venv .env
</QS>
<QS package="source" config="venv" os="mac">
source .env/bin/activate
</QS>
<QS package="source" config="venv" os="linux">
source .env/bin/activate
</QS>
<QS package="source" config="venv" os="windows">
.env\Scripts\activate
</QS>
<QS package="conda" config="venv">
conda create -n venv
</QS>
<QS package="conda" config="venv">
conda activate venv
</QS>
<QS package="pip">pip install -U pip setuptools wheel</QS>
<QS package="source">pip install -U pip setuptools wheel</QS>
<QS package="pip">
pip install -U {pkg}
{pipExtras && `[${pipExtras}]`}
{pipExtras
? `pip install -U '${pkg}[${pipExtras}]'`
: `pip install -U ${pkg}`}
{nightly ? ' --pre' : ''}
</QS>
<QS package="conda">conda install -c conda-forge spacy</QS>
<QS package="conda" hardware="gpu">
conda install -c conda-forge cupy
</QS>
<QS package="conda" config="train">
conda install -c conda-forge spacy-transformers
</QS>
<QS package="source">
git clone https://github.com/{repo}
{nightly ? ` --branch ${DEFAULT_BRANCH}` : ''}
@ -205,9 +228,6 @@ const QuickstartInstall = ({ id, title }) => {
<QS config="train" package="conda" comment prompt={false}>
# packages only available via pip
</QS>
<QS config="train" package="conda">
pip install spacy-transformers
</QS>
<QS config="train" package="conda">
pip install spacy-lookups-data
</QS>