Mirror of https://github.com/explosion/spaCy.git
Merge pull request #10100 from svlandeg/feature/master_copy
Update develop with latest from master (2)
This commit is contained in: d2afdfefc2
@@ -108,8 +108,8 @@ apple =
     thinc-apple-ops>=0.0.4,<1.0.0
 # Language tokenizers with external dependencies
 ja =
-    sudachipy>=0.4.9
-    sudachidict_core>=20200330
+    sudachipy>=0.5.2,!=0.6.1
+    sudachidict_core>=20211220
 ko =
     natto-py==0.9.0
 th =
@@ -1,3 +1,6 @@
+from .errors import Errors
+
+IOB_STRINGS = ("", "I", "O", "B")
 
 IDS = {
     "": NULL_ATTR,
@@ -64,7 +67,6 @@ IDS = {
     "FLAG61": FLAG61,
     "FLAG62": FLAG62,
     "FLAG63": FLAG63,
-
     "ID": ID,
     "ORTH": ORTH,
     "LOWER": LOWER,
@@ -72,7 +74,6 @@ IDS = {
     "SHAPE": SHAPE,
     "PREFIX": PREFIX,
     "SUFFIX": SUFFIX,
-
     "LENGTH": LENGTH,
     "LEMMA": LEMMA,
     "POS": POS,
@@ -87,7 +88,7 @@ IDS = {
     "SPACY": SPACY,
     "LANG": LANG,
     "MORPH": MORPH,
-    "IDX": IDX
+    "IDX": IDX,
 }
 
 
@@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     """
     inty_attrs = {}
     if _do_deprecated:
-        if 'F' in stringy_attrs:
+        if "F" in stringy_attrs:
             stringy_attrs["ORTH"] = stringy_attrs.pop("F")
-        if 'L' in stringy_attrs:
+        if "L" in stringy_attrs:
             stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
-        if 'pos' in stringy_attrs:
+        if "pos" in stringy_attrs:
             stringy_attrs["TAG"] = stringy_attrs.pop("pos")
-        if 'morph' in stringy_attrs:
-            morphs = stringy_attrs.pop('morph')
-        if 'number' in stringy_attrs:
-            stringy_attrs.pop('number')
-        if 'tenspect' in stringy_attrs:
-            stringy_attrs.pop('tenspect')
+        if "morph" in stringy_attrs:
+            morphs = stringy_attrs.pop("morph")
+        if "number" in stringy_attrs:
+            stringy_attrs.pop("number")
+        if "tenspect" in stringy_attrs:
+            stringy_attrs.pop("tenspect")
         morph_keys = [
-            'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
-            'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
-            'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
-            'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
-            'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
-            'NumValue', 'PartType', 'Polite', 'StyleVariant',
-            'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
-            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
-            'Polarity', 'PrepCase', 'Animacy'  # U20
+            "PunctType",
+            "PunctSide",
+            "Other",
+            "Degree",
+            "AdvType",
+            "Number",
+            "VerbForm",
+            "PronType",
+            "Aspect",
+            "Tense",
+            "PartType",
+            "Poss",
+            "Hyph",
+            "ConjType",
+            "NumType",
+            "Foreign",
+            "VerbType",
+            "NounType",
+            "Gender",
+            "Mood",
+            "Negative",
+            "Tense",
+            "Voice",
+            "Abbr",
+            "Derivation",
+            "Echo",
+            "Foreign",
+            "NameType",
+            "NounType",
+            "NumForm",
+            "NumValue",
+            "PartType",
+            "Polite",
+            "StyleVariant",
+            "PronType",
+            "AdjType",
+            "Person",
+            "Variant",
+            "AdpType",
+            "Reflex",
+            "Negative",
+            "Mood",
+            "Aspect",
+            "Case",
+            "Polarity",
+            "PrepCase",
+            "Animacy",  # U20
         ]
         for key in morph_keys:
             if key in stringy_attrs:
@@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     for name, value in stringy_attrs.items():
         int_key = intify_attr(name)
         if int_key is not None:
+            if int_key == ENT_IOB:
+                if value in IOB_STRINGS:
+                    value = IOB_STRINGS.index(value)
+                elif isinstance(value, str):
+                    raise ValueError(Errors.E1025.format(value=value))
             if strings_map is not None and isinstance(value, str):
-                if hasattr(strings_map, 'add'):
+                if hasattr(strings_map, "add"):
                     value = strings_map.add(value)
                 else:
                     value = strings_map[value]
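Taken together with the `IOB_STRINGS` tuple and the new `E1025` error above, this hunk lets `intify_attrs` turn IOB strings into their integer codes. A minimal sketch of the resulting behavior (it mirrors the new `test_attrs_ent_iob_intify` test further down):

```python
from spacy.attrs import ENT_IOB, intify_attrs

# IOB strings map to their index in IOB_STRINGS: "" -> 0, "I" -> 1, "O" -> 2, "B" -> 3
assert intify_attrs({"ENT_IOB": "B"}) == {ENT_IOB: 3}
assert intify_attrs({"ENT_IOB": ""}) == {ENT_IOB: 0}

# any other string raises E1025 instead of being silently interned
try:
    intify_attrs({"ENT_IOB": "XX"})
except ValueError as err:
    print(err)
```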
@@ -14,7 +14,7 @@ from ..training.initialize import get_sourced_components
 from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..pipeline._parser_internals.nonproj import DELIMITER
-from ..pipeline import Morphologizer
+from ..pipeline import Morphologizer, SpanCategorizer
 from ..morphology import Morphology
 from ..language import Language
 from ..util import registry, resolve_dot_names
@@ -699,8 +699,34 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
     return count
 
 
-def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]:
-    if pipe_name not in nlp.pipe_names:
-        return set()
-    pipe = nlp.get_pipe(pipe_name)
-    return set(pipe.labels)
+def _get_labels_from_model(
+    nlp: Language, factory_name: str
+) -> Set[str]:
+    pipe_names = [
+        pipe_name
+        for pipe_name in nlp.pipe_names
+        if nlp.get_pipe_meta(pipe_name).factory == factory_name
+    ]
+    labels: Set[str] = set()
+    for pipe_name in pipe_names:
+        pipe = nlp.get_pipe(pipe_name)
+        labels.update(pipe.labels)
+    return labels
+
+
+def _get_labels_from_spancat(
+    nlp: Language
+) -> Dict[str, Set[str]]:
+    pipe_names = [
+        pipe_name
+        for pipe_name in nlp.pipe_names
+        if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+    ]
+    labels: Dict[str, Set[str]] = {}
+    for pipe_name in pipe_names:
+        pipe = nlp.get_pipe(pipe_name)
+        assert isinstance(pipe, SpanCategorizer)
+        if pipe.key not in labels:
+            labels[pipe.key] = set()
+        labels[pipe.key].update(pipe.labels)
+    return labels
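A sketch of how the rewritten helpers are meant to be called (the pipe name below is only illustrative; the behavior matches the new `test_get_labels_from_model` test added later in this diff). `_get_labels_from_model` now takes a factory name instead of a pipe name, so it collects labels from every component created by that factory:

```python
from spacy.cli.debug_data import _get_labels_from_model, _get_labels_from_spancat
from spacy.lang.en import English

nlp = English()
ner = nlp.add_pipe("ner", name="my_ner")  # pipe name differs from the factory name
for label in ("A", "B"):
    ner.add_label(label)
nlp.initialize()

assert _get_labels_from_model(nlp, "ner") == {"A", "B"}  # looked up by factory
assert _get_labels_from_spancat(nlp) == {}  # no spancat components in this pipeline
```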
@@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = {
     "LOC": "#ff9561",
     "PERSON": "#aa9cfc",
     "NORP": "#c887fb",
-    "FACILITY": "#9cc9cc",
+    "FAC": "#9cc9cc",
     "EVENT": "#ffeb80",
     "LAW": "#ff8197",
     "LANGUAGE": "#ff8197",
@@ -888,11 +888,14 @@ class Errors(metaclass=ErrorsWithCodes):
     E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
              "Non-UD tags should use the `tag` property.")
     E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
-    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
-    E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.")
+    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
+             "exist.")
+    E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
+             "patterns.")
+    E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
+             "supported values are: 'I', 'O', 'B' and ''")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
 OLD_MODEL_SHORTCUTS = {
     "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
@@ -1285,9 +1285,9 @@ class Language:
                 )
             except IOError:
                 raise IOError(Errors.E884.format(vectors=I["vectors"]))
-        if self.vocab.vectors.data.shape[1] >= 1:
+        if self.vocab.vectors.shape[1] >= 1:
             ops = get_current_ops()
-            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
+            self.vocab.vectors.to_ops(ops)
         if hasattr(self.tokenizer, "initialize"):
             tok_settings = validate_init_settings(
                 self.tokenizer.initialize,  # type: ignore[union-attr]
@@ -1332,8 +1332,8 @@ class Language:
         DOCS: https://spacy.io/api/language#resume_training
         """
         ops = get_current_ops()
-        if self.vocab.vectors.data.shape[1] >= 1:
-            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
+        if self.vocab.vectors.shape[1] >= 1:
+            self.vocab.vectors.to_ops(ops)
         for name, proc in self.pipeline:
             if hasattr(proc, "_rehearsal_model"):
                 proc._rehearsal_model = deepcopy(proc.model)  # type: ignore[attr-defined]
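Both `Language` hunks swap direct `vectors.data` manipulation for the new `Vectors.to_ops` helper introduced later in this diff. Roughly, the equivalent call outside of `Language` looks like this (a sketch; `spacy.blank` is just a convenient way to get a vocab):

```python
from thinc.api import get_current_ops
import spacy

nlp = spacy.blank("en")
ops = get_current_ops()
# move the vectors table to the current backend (numpy on CPU, cupy on GPU)
nlp.vocab.vectors.to_ops(ops)
```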
@@ -130,8 +130,10 @@ cdef class Lexeme:
             return 0.0
         vector = self.vector
         xp = get_array_module(vector)
-        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
+        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+        return result.item()
 
     @property
     def has_vector(self):
         """RETURNS (bool): Whether a word vector is associated with the object.
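The `.item()` call makes `Lexeme.similarity` return a plain Python float even when the vectors live on the GPU (cupy returns a 0-d array from `dot`, while numpy already collapses it to a scalar). The similarity tests further down check exactly this; as a hedged usage sketch, assuming any vectors-enabled pipeline such as `en_core_web_md` is installed:

```python
import spacy

nlp = spacy.load("en_core_web_md")  # assumes a pipeline with word vectors
score = nlp.vocab["apple"].similarity(nlp.vocab["orange"])
assert isinstance(score, float)  # a scalar on both CPU and GPU
```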
@@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..tokens.morphanalysis cimport MorphAnalysis
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
+from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
 
 from ..schemas import validate_token_pattern
 from ..errors import Errors, MatchPatternError, Warnings
@@ -798,7 +798,10 @@ def _get_attr_values(spec, string_store):
             attr = "SENT_START"
         attr = IDS.get(attr)
         if isinstance(value, str):
-            value = string_store.add(value)
+            if attr == ENT_IOB and value in Token.iob_strings():
+                value = Token.iob_strings().index(value)
+            else:
+                value = string_store.add(value)
         elif isinstance(value, bool):
             value = int(value)
         elif isinstance(value, int):
@@ -23,7 +23,7 @@ def create_pretrain_vectors(
     maxout_pieces: int, hidden_size: int, loss: str
 ) -> Callable[["Vocab", Model], Model]:
     def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
-        if vocab.vectors.data.shape[1] == 0:
+        if vocab.vectors.shape[1] == 0:
             raise ValueError(Errors.E875)
         model = build_cloze_multi_task_model(
             vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
@@ -116,7 +116,7 @@ def build_multi_task_model(
 def build_cloze_multi_task_model(
     vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
 ) -> Model:
-    nO = vocab.vectors.data.shape[1]
+    nO = vocab.vectors.shape[1]
     output_layer = chain(
         cast(Model[List["Floats2d"], Floats2d], list2array()),
         Maxout(
@@ -94,7 +94,7 @@ def init(
     nM = model.get_dim("nM") if model.has_dim("nM") else None
     nO = model.get_dim("nO") if model.has_dim("nO") else None
     if X is not None and len(X):
-        nM = X[0].vocab.vectors.data.shape[1]
+        nM = X[0].vocab.vectors.shape[1]
     if Y is not None:
         nO = Y.data.shape[1]
 
@@ -1,5 +1,6 @@
 from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
 from typing import Iterable, TypeVar, TYPE_CHECKING
+from .compat import Literal
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator, create_model
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
@@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
 UnderscoreValue = Union[
     TokenPatternString, TokenPatternNumber, str, int, float, list, bool
 ]
+IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3]
 
 
 class TokenPattern(BaseModel):
@@ -222,6 +224,7 @@ class TokenPattern(BaseModel):
     lemma: Optional[StringValue] = None
     shape: Optional[StringValue] = None
     ent_type: Optional[StringValue] = None
+    ent_iob: Optional[IobValue] = None
     ent_id: Optional[StringValue] = None
     ent_kb_id: Optional[StringValue] = None
     norm: Optional[StringValue] = None
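With the `IobValue` literal in place, a token pattern may set `ENT_IOB` to one of the IOB strings or codes, and anything else is rejected by pattern validation. A small sketch (the invalid case mirrors the `[{"ENT_IOB": "foo"}]` entry added to `TEST_PATTERNS` below); it assumes `validate_token_pattern` returns a list of error messages:

```python
from spacy.schemas import validate_token_pattern

assert validate_token_pattern([{"ENT_IOB": "I"}]) == []  # valid value: no errors
errors = validate_token_pattern([{"ENT_IOB": "foo"}])    # "foo" is not an IOB value
assert len(errors) == 1
```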
@@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
         "Merging the docs is fun.",
         "",
         "They don't think alike. ",
+        "",
         "Another doc.",
     ]
     en_texts_without_empty = [t for t in en_texts if len(t)]
@@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     en_docs = [en_tokenizer(text) for text in en_texts]
     en_docs[0].spans["group"] = [en_docs[0][1:4]]
     en_docs[2].spans["group"] = [en_docs[2][1:4]]
-    en_docs[3].spans["group"] = [en_docs[3][0:1]]
+    en_docs[4].spans["group"] = [en_docs[4][0:1]]
     span_group_texts = sorted(
-        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
+        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
     )
     de_doc = de_tokenizer(de_text)
     Token.set_extension("is_ambiguous", default=False)
@@ -1,4 +1,5 @@
 import pytest
+from spacy.attrs import intify_attrs, ENT_IOB
 
 from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs
 from spacy.lang.en.stop_words import STOP_WORDS
@@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text):
     assert int_attrs == {ORTH: 10, IS_ALPHA: True}
 
 
+def test_attrs_ent_iob_intify():
+    int_attrs = intify_attrs({"ENT_IOB": ""})
+    assert int_attrs == {ENT_IOB: 0}
+
+    int_attrs = intify_attrs({"ENT_IOB": "I"})
+    assert int_attrs == {ENT_IOB: 1}
+
+    int_attrs = intify_attrs({"ENT_IOB": "O"})
+    assert int_attrs == {ENT_IOB: 2}
+
+    int_attrs = intify_attrs({"ENT_IOB": "B"})
+    assert int_attrs == {ENT_IOB: 3}
+
+    int_attrs = intify_attrs({ENT_IOB: ""})
+    assert int_attrs == {ENT_IOB: 0}
+
+    int_attrs = intify_attrs({ENT_IOB: "I"})
+    assert int_attrs == {ENT_IOB: 1}
+
+    int_attrs = intify_attrs({ENT_IOB: "O"})
+    assert int_attrs == {ENT_IOB: 2}
+
+    int_attrs = intify_attrs({ENT_IOB: "B"})
+    assert int_attrs == {ENT_IOB: 3}
+
+    with pytest.raises(ValueError):
+        int_attrs = intify_attrs({"ENT_IOB": "XX"})
+
+    with pytest.raises(ValueError):
+        int_attrs = intify_attrs({ENT_IOB: "XX"})
+
+
 @pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
 def test_lex_attrs_is_punct(text, match):
     assert is_punct(text) == match
@@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab):
     matcher = Matcher(en_vocab)
     matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
     assert len(matcher(doc)) == 0
+
+
+def test_matcher_ent_iob_key(en_vocab):
+    """Test that patterns with ent_iob works correctly."""
+    matcher = Matcher(en_vocab)
+    matcher.add("Rule", [[{"ENT_IOB": "I"}]])
+    doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
+    doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
+    doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
+    doc2.ents = [Span(doc2, 4, 5, label="PERSON")]
+    matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)]
+    matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)]
+    assert len(matches1) == 1
+    assert matches1[0] == "York"
+    assert len(matches2) == 0
+
+    matcher = Matcher(en_vocab)  # Test iob pattern with operators
+    matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
+    doc = Doc(
+        en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
+    )
+    doc.ents = [Span(doc, 4, 7, label="PERSON")]
+    matches = [doc[start:end].text for _, start, end in matcher(doc)]
+    assert len(matches) == 3
+    assert matches[0] == "Maria"
+    assert matches[1] == "Maria Esperanza"
+    assert matches[2] == "Esperanza"
@@ -12,6 +12,7 @@ TEST_PATTERNS = [
     ([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
     ([{"_": "foo"}], 1, 1),
     ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
+    ([{"ENT_IOB": "foo"}], 1, 1),
     ([1, 2, 3], 3, 1),
     # Bad patterns flagged outside of Matcher
     ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0),  # prev: (1, 0)
@@ -79,7 +79,8 @@ def test_explicit_labels():
     nlp.initialize()
     assert spancat.labels == ("PERSON", "LOC")
 
 
+# TODO figure out why this is flaky
 @pytest.mark.skip(reason="Test is unreliable for unknown reason")
 def test_doc_gc():
     # If the Doc object is garbage collected, the spans won't be functional afterwards
     nlp = Language()
@@ -97,6 +98,7 @@ def test_doc_gc():
     assert isinstance(spangroups, SpanGroups)
     for key, spangroup in spangroups.items():
         assert isinstance(spangroup, SpanGroup)
+        # XXX This fails with length 0 sometimes
         assert len(spangroup) > 0
         with pytest.raises(RuntimeError):
             span = spangroup[0]
@@ -12,6 +12,8 @@ from spacy.cli._util import is_subpath_of, load_project_config
 from spacy.cli._util import parse_config_overrides, string_to_list
 from spacy.cli._util import substitute_project_variables
 from spacy.cli._util import validate_project_commands
+from spacy.cli.debug_data import _get_labels_from_model
+from spacy.cli.debug_data import _get_labels_from_spancat
 from spacy.cli.download import get_compatibility, get_version
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
 from spacy.cli.package import get_third_party_dependencies
@@ -665,3 +667,28 @@ def test_get_third_party_dependencies():
 )
 def test_is_subpath_of(parent, child, expected):
     assert is_subpath_of(parent, child) == expected
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "factory_name,pipe_name",
+    [
+        ("ner", "ner"),
+        ("ner", "my_ner"),
+        ("spancat", "spancat"),
+        ("spancat", "my_spancat"),
+    ],
+)
+def test_get_labels_from_model(factory_name, pipe_name):
+    labels = ("A", "B")
+
+    nlp = English()
+    pipe = nlp.add_pipe(factory_name, name=pipe_name)
+    for label in labels:
+        pipe.add_label(label)
+    nlp.initialize()
+    assert nlp.get_pipe(pipe_name).labels == labels
+    if factory_name == "spancat":
+        assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
+    else:
+        assert _get_labels_from_model(nlp, factory_name) == set(labels)
@@ -35,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors):
     assert lex1.vector_norm != 0
     assert lex2.vector_norm != 0
     assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1]
+    assert isinstance(lex1.similarity(lex2), float)
     assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2))
     assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
 
@@ -47,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors):
     assert doc[0].vector_norm != 0
     assert doc[1].vector_norm != 0
     assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1]
+    assert isinstance(doc[0].similarity(doc[1]), float)
     assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2))
     assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
 
 
+def test_vectors_similarity_SS(vocab, vectors):
+    [(word1, vec1), (word2, vec2)] = vectors
+    doc = Doc(vocab, words=[word1, word2])
+    assert isinstance(doc[0:1].similarity(doc[0:2]), float)
+    assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1])
+
+
+def test_vectors_similarity_DD(vocab, vectors):
+    [(word1, vec1), (word2, vec2)] = vectors
+    doc1 = Doc(vocab, words=[word1, word2])
+    doc2 = Doc(vocab, words=[word2, word1])
+    assert isinstance(doc1.similarity(doc2), float)
+    assert doc1.similarity(doc2) == doc2.similarity(doc1)
+
+
 def test_vectors_similarity_TD(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = Doc(vocab, words=[word1, word2])
     with pytest.warns(UserWarning):
         assert isinstance(doc.similarity(doc[0]), float)
         assert isinstance(doc[0].similarity(doc), float)
         assert doc.similarity(doc[0]) == doc[0].similarity(doc)
 
 
-def test_vectors_similarity_DS(vocab, vectors):
-    [(word1, vec1), (word2, vec2)] = vectors
-    doc = Doc(vocab, words=[word1, word2])
-    assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
-
-
 def test_vectors_similarity_TS(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = Doc(vocab, words=[word1, word2])
     with pytest.warns(UserWarning):
         assert isinstance(doc[:2].similarity(doc[0]), float)
         assert isinstance(doc[0].similarity(doc[-2]), float)
         assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
+
+
+def test_vectors_similarity_DS(vocab, vectors):
+    [(word1, vec1), (word2, vec2)] = vectors
+    doc = Doc(vocab, words=[word1, word2])
+    assert isinstance(doc.similarity(doc[:2]), float)
+    assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
@@ -421,7 +421,7 @@ def test_vector_is_oov():
 def test_init_vectors_unset():
     v = Vectors(shape=(10, 10))
     assert v.is_full is False
-    assert v.data.shape == (10, 10)
+    assert v.shape == (10, 10)
 
     with pytest.raises(ValueError):
         v = Vectors(shape=(10, 10), mode="floret")
@@ -514,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
     # rows: 2 rows per ngram
     rows = OPS.xp.asarray(
         [
-            h % nlp.vocab.vectors.data.shape[0]
+            h % nlp.vocab.vectors.shape[0]
             for ngram in ngrams
             for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
         ],
@@ -544,17 +544,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
     # an empty key returns 0s
     assert_equal(
         OPS.to_numpy(nlp.vocab[""].vector),
-        numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
+        numpy.zeros((nlp.vocab.vectors.shape[0],)),
     )
     # an empty batch returns 0s
     assert_equal(
         OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
-        numpy.zeros((1, nlp.vocab.vectors.data.shape[0])),
+        numpy.zeros((1, nlp.vocab.vectors.shape[0])),
     )
     # an empty key within a batch returns 0s
     assert_equal(
         OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
-        numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
+        numpy.zeros((nlp.vocab.vectors.shape[0],)),
     )
 
     # the loaded ngram vector table cannot be modified
@@ -616,7 +616,7 @@ cdef class Doc:
         """
         if "has_vector" in self.user_hooks:
            return self.user_hooks["has_vector"](self)
-        elif self.vocab.vectors.data.size:
+        elif self.vocab.vectors.size:
            return True
        elif self.tensor.size:
            return True
@@ -641,7 +641,7 @@ cdef class Doc:
         if not len(self):
             self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
             return self._vector
-        elif self.vocab.vectors.data.size > 0:
+        elif self.vocab.vectors.size > 0:
             self._vector = sum(t.vector for t in self) / len(self)
             return self._vector
         elif self.tensor.size > 0:
@@ -1183,7 +1183,7 @@ cdef class Doc:
         token_offset = -1
         for doc in docs[:-1]:
             token_offset += len(doc)
-            if not (len(doc) > 0 and doc[-1].is_space):
+            if len(doc) > 0 and not doc[-1].is_space:
                 concat_spaces[token_offset] = True
 
         concat_array = numpy.concatenate(arrays)
@@ -352,8 +352,10 @@ cdef class Span:
             return 0.0
         vector = self.vector
         xp = get_array_module(vector)
-        return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+        return result.item()
 
     cpdef np.ndarray to_array(self, object py_attr_ids):
         """Given a list of M attribute IDs, export the tokens to a numpy
         `ndarray` of shape `(N, M)`, where `N` is the length of the document.
@@ -485,7 +487,7 @@ cdef class Span:
         """
         if "has_vector" in self.doc.user_span_hooks:
             return self.doc.user_span_hooks["has_vector"](self)
-        elif self.vocab.vectors.data.size > 0:
+        elif self.vocab.vectors.size > 0:
             return any(token.has_vector for token in self)
         elif self.doc.tensor.size > 0:
             return True
@@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads
 
 from .. import parts_of_speech
 from ..errors import Errors, Warnings
+from ..attrs import IOB_STRINGS
 from .underscore import Underscore, get_ext_args
 
 
@@ -209,8 +210,10 @@ cdef class Token:
             return 0.0
         vector = self.vector
         xp = get_array_module(vector)
-        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
+        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+        return result.item()
 
     def has_morph(self):
         """Check whether the token has annotated morph information.
         Return False when the morph annotation is unset/missing.
@@ -743,7 +746,7 @@ cdef class Token:
 
     @classmethod
     def iob_strings(cls):
-        return ("", "I", "O", "B")
+        return IOB_STRINGS
 
     @property
     def ent_iob_(self):
@@ -164,7 +164,7 @@ def load_vectors_into_model(
         len(vectors_nlp.vocab.vectors.keys()) == 0
         and vectors_nlp.vocab.vectors.mode != VectorsMode.floret
     ) or (
-        vectors_nlp.vocab.vectors.data.shape[0] == 0
+        vectors_nlp.vocab.vectors.shape[0] == 0
         and vectors_nlp.vocab.vectors.mode == VectorsMode.floret
     ):
         logger.warning(Warnings.W112.format(name=name))
@@ -10,7 +10,7 @@ from typing import cast
 import warnings
 from enum import Enum
 import srsly
-from thinc.api import get_array_module, get_current_ops
+from thinc.api import Ops, get_array_module, get_current_ops
 from thinc.backends import get_array_ops
 from thinc.types import Floats2d
 
@@ -146,7 +146,7 @@ cdef class Vectors:
 
         DOCS: https://spacy.io/api/vectors#size
         """
-        return self.data.shape[0] * self.data.shape[1]
+        return self.data.size
 
     @property
     def is_full(self):
@@ -517,6 +517,9 @@ cdef class Vectors:
             for i in range(len(queries)) ], dtype="uint64")
         return (keys, best_rows, scores)
 
+    def to_ops(self, ops: Ops):
+        self.data = ops.asarray(self.data)
+
     def _get_cfg(self):
         if self.mode == Mode.default:
             return {
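Usage of the new method matches the `Vectors.to_ops` documentation added further down in this diff; a minimal sketch:

```python
from thinc.api import NumpyOps
from spacy.vectors import Vectors

vectors = Vectors(shape=(10, 10))
vectors.to_ops(NumpyOps())  # re-allocate the table with the given ops' array module
```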
@@ -283,7 +283,7 @@ cdef class Vocab:
 
     @property
     def vectors_length(self):
-        return self.vectors.data.shape[1]
+        return self.vectors.shape[1]
 
     def reset_vectors(self, *, width=None, shape=None):
         """Drop the current vector table. Because all vectors must be the same
@@ -294,7 +294,7 @@ cdef class Vocab:
         elif shape is not None:
             self.vectors = Vectors(strings=self.strings, shape=shape)
         else:
-            width = width if width is not None else self.vectors.data.shape[1]
+            width = width if width is not None else self.vectors.shape[1]
             self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))
 
     def prune_vectors(self, nr_row, batch_size=1024):
@@ -99,9 +99,9 @@ be a token pattern (list) or a phrase pattern (string). For example:
 ## EntityRuler.initialize {#initialize tag="method" new="3"}
 
 Initialize the component with data and used before training to load in rules
-from a file. This method is typically called by
-[`Language.initialize`](/api/language#initialize) and lets you customize
-arguments it receives via the
+from a [pattern file](/usage/rule-based-matching/#entityruler-files). This method
+is typically called by [`Language.initialize`](/api/language#initialize) and
+lets you customize arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.
 
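For illustration, initializing the component directly from Python might look roughly like this (a hedged sketch; the pattern list is made up, and in a real setup the patterns would usually come from the `[initialize.components]` block or a pattern file instead):

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "Apple"}]
# no training examples are needed, so pass an empty get_examples callback
ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
```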
@@ -44,6 +44,7 @@ rule-based matching are:
 | `SPACY` | Token has a trailing space. ~~bool~~ |
 | `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
 | `ENT_TYPE` | The token's entity label. ~~str~~ |
+| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ |
 | `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
 | `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ |
 | `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
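A short sketch of the newly documented `ENT_IOB` attribute in a `Matcher` pattern (condensed from the matcher test added earlier in this diff): matching `{"ENT_IOB": "I"}` picks out tokens that are inside, but not at the start of, an entity.

```python
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span

nlp = English()
matcher = Matcher(nlp.vocab)
matcher.add("INSIDE_ENT", [[{"ENT_IOB": "I"}]])

doc = Doc(nlp.vocab, words=["I", "visited", "New", "York", "and", "California"])
doc.ents = [Span(doc, 2, 4, label="GPE"), Span(doc, 5, 6, label="GPE")]
# only "York" is a non-initial entity token
assert [doc[s:e].text for _, s, e in matcher(doc)] == ["York"]
```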
@@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch.
 | ------ | --------------------------------------- |
 | `keys` | The keys. ~~Iterable[Union[int, str]]~~ |
 
+## Vectors.to_ops {#to_ops tag="method"}
+
+Change the embedding matrix to use different Thinc ops.
+
+> #### Example
+>
+> ```python
+> from thinc.api import NumpyOps
+>
+> vectors.to_ops(NumpyOps())
+>
+> ```
+
+| Name  | Description                                               |
+| ----- | --------------------------------------------------------- |
+| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ |
+
 ## Vectors.to_disk {#to_disk tag="method"}
 
 Save the current state to a directory.
@@ -1479,7 +1479,7 @@ especially useful it you want to pass in a string instead of calling
 ### Example: Pipeline component for GPE entities and country meta data via a REST API {#component-example3}
 
 This example shows the implementation of a pipeline component that fetches
-country meta data via the [REST Countries API](https://restcountries.eu), sets
+country meta data via the [REST Countries API](https://restcountries.com), sets
 entity annotations for countries and sets custom attributes on the `Doc` and
 `Span` – for example, the capital, latitude/longitude coordinates and even the
 country flag.
@@ -1495,7 +1495,7 @@ from spacy.tokens import Doc, Span, Token
 @Language.factory("rest_countries")
 class RESTCountriesComponent:
     def __init__(self, nlp, name, label="GPE"):
-        r = requests.get("https://restcountries.eu/rest/v2/all")
+        r = requests.get("https://restcountries.com/v2/all")
         r.raise_for_status()  # make sure requests raises an error if it fails
         countries = r.json()
         # Convert API response to dict keyed by country name for easy lookup
@@ -1770,9 +1770,9 @@
             "title": "Applied Language Technology",
             "slogan": "NLP for newcomers using spaCy and Stanza",
             "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. The learning materials assume no previous knowledge of the Python programming language.",
-            "url": "https://applied-language-technology.readthedocs.io/",
+            "url": "https://applied-language-technology.mooc.fi",
             "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg",
-            "thumb": "https://applied-language-technology.readthedocs.io/en/latest/_static/logo.png",
+            "thumb": "https://www.mv.helsinki.fi/home/thiippal/images/applt-logo.png",
             "author": "Tuomo Hiippala",
             "author_links": {
                 "twitter": "tuomo_h",
@@ -113,8 +113,7 @@ const QuickstartInstall = ({ id, title }) => {
         {
             id: 'venv',
             title: 'virtual env',
-            help:
-                'Use a virtual environment and install spaCy into a user directory',
+            help: 'Use a virtual environment',
         },
         {
             id: 'train',
@@ -165,27 +164,51 @@ const QuickstartInstall = ({ id, title }) => {
             setters={setters}
             showDropdown={showDropdown}
         >
-            <QS config="venv">python -m venv .env</QS>
-            <QS config="venv" os="mac">
+            <QS package="pip" config="venv">
+                python -m venv .env
+            </QS>
+            <QS package="pip" config="venv" os="mac">
                 source .env/bin/activate
             </QS>
-            <QS config="venv" os="linux">
+            <QS package="pip" config="venv" os="linux">
                 source .env/bin/activate
             </QS>
-            <QS config="venv" os="windows">
+            <QS package="pip" config="venv" os="windows">
                 .env\Scripts\activate
             </QS>
+            <QS package="source" config="venv">
+                python -m venv .env
+            </QS>
+            <QS package="source" config="venv" os="mac">
+                source .env/bin/activate
+            </QS>
+            <QS package="source" config="venv" os="linux">
+                source .env/bin/activate
+            </QS>
+            <QS package="source" config="venv" os="windows">
+                .env\Scripts\activate
+            </QS>
+            <QS package="conda" config="venv">
+                conda create -n venv
+            </QS>
+            <QS package="conda" config="venv">
+                conda activate venv
+            </QS>
             <QS package="pip">pip install -U pip setuptools wheel</QS>
             <QS package="source">pip install -U pip setuptools wheel</QS>
             <QS package="pip">
-                pip install -U {pkg}
-                {pipExtras && `[${pipExtras}]`}
+                {pipExtras
+                    ? `pip install -U '${pkg}[${pipExtras}]'`
+                    : `pip install -U ${pkg}`}
                 {nightly ? ' --pre' : ''}
             </QS>
             <QS package="conda">conda install -c conda-forge spacy</QS>
             <QS package="conda" hardware="gpu">
                 conda install -c conda-forge cupy
             </QS>
             <QS package="conda" config="train">
                 conda install -c conda-forge spacy-transformers
             </QS>
             <QS package="source">
                 git clone https://github.com/{repo}
                 {nightly ? ` --branch ${DEFAULT_BRANCH}` : ''}
@@ -205,9 +228,6 @@ const QuickstartInstall = ({ id, title }) => {
             <QS config="train" package="conda" comment prompt={false}>
                 # packages only available via pip
             </QS>
-            <QS config="train" package="conda">
-                pip install spacy-transformers
-            </QS>
             <QS config="train" package="conda">
                 pip install spacy-lookups-data
             </QS>