Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-03 13:14:11 +03:00)

Merge pull request #10100 from svlandeg/feature/master_copy
Update develop with latest from master (2)

Commit d2afdfefc2
@@ -108,8 +108,8 @@ apple =
     thinc-apple-ops>=0.0.4,<1.0.0
 # Language tokenizers with external dependencies
 ja =
-    sudachipy>=0.4.9
-    sudachidict_core>=20200330
+    sudachipy>=0.5.2,!=0.6.1
+    sudachidict_core>=20211220
 ko =
     natto-py==0.9.0
 th =
@@ -1,3 +1,6 @@
+from .errors import Errors
+
+IOB_STRINGS = ("", "I", "O", "B")
+
 IDS = {
     "": NULL_ATTR,
@@ -64,7 +67,6 @@ IDS = {
     "FLAG61": FLAG61,
     "FLAG62": FLAG62,
     "FLAG63": FLAG63,
-
     "ID": ID,
     "ORTH": ORTH,
     "LOWER": LOWER,
@@ -72,7 +74,6 @@ IDS = {
     "SHAPE": SHAPE,
     "PREFIX": PREFIX,
     "SUFFIX": SUFFIX,
-
     "LENGTH": LENGTH,
     "LEMMA": LEMMA,
     "POS": POS,
@@ -87,7 +88,7 @@ IDS = {
     "SPACY": SPACY,
     "LANG": LANG,
     "MORPH": MORPH,
-    "IDX": IDX
+    "IDX": IDX,
 }
@@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     """
     inty_attrs = {}
     if _do_deprecated:
-        if 'F' in stringy_attrs:
+        if "F" in stringy_attrs:
             stringy_attrs["ORTH"] = stringy_attrs.pop("F")
-        if 'L' in stringy_attrs:
+        if "L" in stringy_attrs:
             stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
-        if 'pos' in stringy_attrs:
+        if "pos" in stringy_attrs:
             stringy_attrs["TAG"] = stringy_attrs.pop("pos")
-        if 'morph' in stringy_attrs:
-            morphs = stringy_attrs.pop('morph')
-        if 'number' in stringy_attrs:
-            stringy_attrs.pop('number')
-        if 'tenspect' in stringy_attrs:
-            stringy_attrs.pop('tenspect')
+        if "morph" in stringy_attrs:
+            morphs = stringy_attrs.pop("morph")
+        if "number" in stringy_attrs:
+            stringy_attrs.pop("number")
+        if "tenspect" in stringy_attrs:
+            stringy_attrs.pop("tenspect")
         morph_keys = [
-            'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
-            'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
-            'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
-            'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
-            'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
-            'NumValue', 'PartType', 'Polite', 'StyleVariant',
-            'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
-            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
-            'Polarity', 'PrepCase', 'Animacy'  # U20
+            "PunctType",
+            "PunctSide",
+            "Other",
+            "Degree",
+            "AdvType",
+            "Number",
+            "VerbForm",
+            "PronType",
+            "Aspect",
+            "Tense",
+            "PartType",
+            "Poss",
+            "Hyph",
+            "ConjType",
+            "NumType",
+            "Foreign",
+            "VerbType",
+            "NounType",
+            "Gender",
+            "Mood",
+            "Negative",
+            "Tense",
+            "Voice",
+            "Abbr",
+            "Derivation",
+            "Echo",
+            "Foreign",
+            "NameType",
+            "NounType",
+            "NumForm",
+            "NumValue",
+            "PartType",
+            "Polite",
+            "StyleVariant",
+            "PronType",
+            "AdjType",
+            "Person",
+            "Variant",
+            "AdpType",
+            "Reflex",
+            "Negative",
+            "Mood",
+            "Aspect",
+            "Case",
+            "Polarity",
+            "PrepCase",
+            "Animacy",  # U20
         ]
         for key in morph_keys:
             if key in stringy_attrs:
@@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     for name, value in stringy_attrs.items():
         int_key = intify_attr(name)
         if int_key is not None:
+            if int_key == ENT_IOB:
+                if value in IOB_STRINGS:
+                    value = IOB_STRINGS.index(value)
+                elif isinstance(value, str):
+                    raise ValueError(Errors.E1025.format(value=value))
             if strings_map is not None and isinstance(value, str):
-                if hasattr(strings_map, 'add'):
+                if hasattr(strings_map, "add"):
                     value = strings_map.add(value)
                 else:
                     value = strings_map[value]
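For reference, a minimal sketch of the behaviour pinned down by the new attrs tests further down in this commit:

```python
from spacy.attrs import ENT_IOB, intify_attrs

# IOB strings map to their index in IOB_STRINGS ("", "I", "O", "B")
assert intify_attrs({"ENT_IOB": ""}) == {ENT_IOB: 0}
assert intify_attrs({"ENT_IOB": "B"}) == {ENT_IOB: 3}
```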
@@ -14,7 +14,7 @@ from ..training.initialize import get_sourced_components
 from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..pipeline._parser_internals.nonproj import DELIMITER
-from ..pipeline import Morphologizer
+from ..pipeline import Morphologizer, SpanCategorizer
 from ..morphology import Morphology
 from ..language import Language
 from ..util import registry, resolve_dot_names
@@ -699,8 +699,34 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
     return count


-def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]:
-    if pipe_name not in nlp.pipe_names:
-        return set()
-    pipe = nlp.get_pipe(pipe_name)
-    return set(pipe.labels)
+def _get_labels_from_model(
+    nlp: Language, factory_name: str
+) -> Set[str]:
+    pipe_names = [
+        pipe_name
+        for pipe_name in nlp.pipe_names
+        if nlp.get_pipe_meta(pipe_name).factory == factory_name
+    ]
+    labels: Set[str] = set()
+    for pipe_name in pipe_names:
+        pipe = nlp.get_pipe(pipe_name)
+        labels.update(pipe.labels)
+    return labels
+
+
+def _get_labels_from_spancat(
+    nlp: Language
+) -> Dict[str, Set[str]]:
+    pipe_names = [
+        pipe_name
+        for pipe_name in nlp.pipe_names
+        if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+    ]
+    labels: Dict[str, Set[str]] = {}
+    for pipe_name in pipe_names:
+        pipe = nlp.get_pipe(pipe_name)
+        assert isinstance(pipe, SpanCategorizer)
+        if pipe.key not in labels:
+            labels[pipe.key] = set()
+        labels[pipe.key].update(pipe.labels)
+    return labels
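A minimal usage sketch of the two helpers; the pipe and label names are illustrative and mirror the new CLI test further down:

```python
from spacy.cli.debug_data import _get_labels_from_model, _get_labels_from_spancat
from spacy.lang.en import English

nlp = English()
spancat = nlp.add_pipe("spancat")
for label in ("PERSON", "LOC"):
    spancat.add_label(label)
nlp.initialize()

# Labels are now collected per factory name / span key rather than per pipe name
assert _get_labels_from_model(nlp, "ner") == set()  # no "ner" pipe present
assert _get_labels_from_spancat(nlp)[spancat.key] == {"PERSON", "LOC"}
```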
@@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = {
     "LOC": "#ff9561",
     "PERSON": "#aa9cfc",
     "NORP": "#c887fb",
-    "FACILITY": "#9cc9cc",
+    "FAC": "#9cc9cc",
     "EVENT": "#ffeb80",
     "LAW": "#ff8197",
     "LANGUAGE": "#ff8197",
@@ -888,9 +888,12 @@ class Errors(metaclass=ErrorsWithCodes):
     E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
              "Non-UD tags should use the `tag` property.")
     E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
-    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
-    E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.")
+    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
+             "exist.")
+    E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
+             "patterns.")
+    E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
+             "supported values are: 'I', 'O', 'B' and ''")

     # Deprecated model shortcuts, only used in errors and warnings
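The new E1025 is raised from `intify_attrs` when an unrecognised IOB string is passed in; a small sketch mirroring the added tests:

```python
import pytest
from spacy.attrs import intify_attrs

with pytest.raises(ValueError):
    intify_attrs({"ENT_IOB": "XX"})  # only "", "I", "O", "B" are accepted
```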
@@ -1285,9 +1285,9 @@ class Language:
             )
         except IOError:
             raise IOError(Errors.E884.format(vectors=I["vectors"]))
-        if self.vocab.vectors.data.shape[1] >= 1:
+        if self.vocab.vectors.shape[1] >= 1:
             ops = get_current_ops()
-            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
+            self.vocab.vectors.to_ops(ops)
         if hasattr(self.tokenizer, "initialize"):
             tok_settings = validate_init_settings(
                 self.tokenizer.initialize,  # type: ignore[union-attr]
@@ -1332,8 +1332,8 @@ class Language:
         DOCS: https://spacy.io/api/language#resume_training
         """
         ops = get_current_ops()
-        if self.vocab.vectors.data.shape[1] >= 1:
-            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
+        if self.vocab.vectors.shape[1] >= 1:
+            self.vocab.vectors.to_ops(ops)
         for name, proc in self.pipeline:
             if hasattr(proc, "_rehearsal_model"):
                 proc._rehearsal_model = deepcopy(proc.model)  # type: ignore[attr-defined]
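Outside of `Language`, the same migration applies whenever a vector table should live on the currently active backend; a minimal sketch:

```python
import spacy
from thinc.api import get_current_ops

nlp = spacy.blank("en")
# Move the vocab's vector table onto the current Thinc ops (numpy on CPU, cupy on GPU)
nlp.vocab.vectors.to_ops(get_current_ops())
```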
@@ -130,7 +130,9 @@ cdef class Lexeme:
             return 0.0
         vector = self.vector
         xp = get_array_module(vector)
-        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
+        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+        return result.item()

     @property
     def has_vector(self):
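The contract this enforces is also asserted by the updated similarity tests below; a minimal sketch (a blank pipeline has no vectors, so the score is 0.0 and a warning is emitted, but the return type is the point):

```python
import spacy

nlp = spacy.blank("en")
lex1, lex2 = nlp.vocab["apple"], nlp.vocab["orange"]
# similarity() now returns a plain Python float on both numpy and cupy backends
assert isinstance(lex1.similarity(lex2), float)
```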
@@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..tokens.morphanalysis cimport MorphAnalysis
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
+from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB

 from ..schemas import validate_token_pattern
 from ..errors import Errors, MatchPatternError, Warnings
@@ -798,6 +798,9 @@ def _get_attr_values(spec, string_store):
                 attr = "SENT_START"
             attr = IDS.get(attr)
         if isinstance(value, str):
+            if attr == ENT_IOB and value in Token.iob_strings():
+                value = Token.iob_strings().index(value)
+            else:
                 value = string_store.add(value)
         elif isinstance(value, bool):
             value = int(value)
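The user-facing effect, condensed from the new matcher test further down:

```python
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("I visited New York and California")
doc.ents = [Span(doc, 2, 4, label="GPE"), Span(doc, 5, 6, label="GPE")]

matcher = Matcher(nlp.vocab)
# "I" matches tokens inside an entity but not at its start, i.e. only "York" here
matcher.add("INSIDE_ENT", [[{"ENT_IOB": "I"}]])
assert [doc[s:e].text for _, s, e in matcher(doc)] == ["York"]
```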
@@ -23,7 +23,7 @@ def create_pretrain_vectors(
     maxout_pieces: int, hidden_size: int, loss: str
 ) -> Callable[["Vocab", Model], Model]:
     def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
-        if vocab.vectors.data.shape[1] == 0:
+        if vocab.vectors.shape[1] == 0:
             raise ValueError(Errors.E875)
         model = build_cloze_multi_task_model(
             vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
@@ -116,7 +116,7 @@ def build_multi_task_model(
 def build_cloze_multi_task_model(
     vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
 ) -> Model:
-    nO = vocab.vectors.data.shape[1]
+    nO = vocab.vectors.shape[1]
     output_layer = chain(
         cast(Model[List["Floats2d"], Floats2d], list2array()),
         Maxout(
@@ -94,7 +94,7 @@ def init(
     nM = model.get_dim("nM") if model.has_dim("nM") else None
     nO = model.get_dim("nO") if model.has_dim("nO") else None
     if X is not None and len(X):
-        nM = X[0].vocab.vectors.data.shape[1]
+        nM = X[0].vocab.vectors.shape[1]
     if Y is not None:
         nO = Y.data.shape[1]
@@ -1,5 +1,6 @@
 from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
 from typing import Iterable, TypeVar, TYPE_CHECKING
+from .compat import Literal
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator, create_model
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
@@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
 UnderscoreValue = Union[
     TokenPatternString, TokenPatternNumber, str, int, float, list, bool
 ]
+IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3]


 class TokenPattern(BaseModel):
@@ -222,6 +224,7 @@ class TokenPattern(BaseModel):
     lemma: Optional[StringValue] = None
     shape: Optional[StringValue] = None
     ent_type: Optional[StringValue] = None
+    ent_iob: Optional[IobValue] = None
     ent_id: Optional[StringValue] = None
     ent_kb_id: Optional[StringValue] = None
     norm: Optional[StringValue] = None
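With the schema extended, pattern validation accepts the new key; a sketch using the public validator (it returns a list of error messages, empty when the pattern is valid):

```python
from spacy.schemas import validate_token_pattern

assert validate_token_pattern([{"ENT_IOB": "I"}]) == []
assert len(validate_token_pattern([{"ENT_IOB": "foo"}])) == 1
```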
@@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
         "Merging the docs is fun.",
         "",
         "They don't think alike. ",
+        "",
         "Another doc.",
     ]
     en_texts_without_empty = [t for t in en_texts if len(t)]
@@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     en_docs = [en_tokenizer(text) for text in en_texts]
     en_docs[0].spans["group"] = [en_docs[0][1:4]]
     en_docs[2].spans["group"] = [en_docs[2][1:4]]
-    en_docs[3].spans["group"] = [en_docs[3][0:1]]
+    en_docs[4].spans["group"] = [en_docs[4][0:1]]
     span_group_texts = sorted(
-        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
+        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
     )
     de_doc = de_tokenizer(de_text)
     Token.set_extension("is_ambiguous", default=False)
@@ -1,4 +1,5 @@
 import pytest
+from spacy.attrs import intify_attrs, ENT_IOB

 from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs
 from spacy.lang.en.stop_words import STOP_WORDS
@@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text):
     assert int_attrs == {ORTH: 10, IS_ALPHA: True}


+def test_attrs_ent_iob_intify():
+    int_attrs = intify_attrs({"ENT_IOB": ""})
+    assert int_attrs == {ENT_IOB: 0}
+
+    int_attrs = intify_attrs({"ENT_IOB": "I"})
+    assert int_attrs == {ENT_IOB: 1}
+
+    int_attrs = intify_attrs({"ENT_IOB": "O"})
+    assert int_attrs == {ENT_IOB: 2}
+
+    int_attrs = intify_attrs({"ENT_IOB": "B"})
+    assert int_attrs == {ENT_IOB: 3}
+
+    int_attrs = intify_attrs({ENT_IOB: ""})
+    assert int_attrs == {ENT_IOB: 0}
+
+    int_attrs = intify_attrs({ENT_IOB: "I"})
+    assert int_attrs == {ENT_IOB: 1}
+
+    int_attrs = intify_attrs({ENT_IOB: "O"})
+    assert int_attrs == {ENT_IOB: 2}
+
+    int_attrs = intify_attrs({ENT_IOB: "B"})
+    assert int_attrs == {ENT_IOB: 3}
+
+    with pytest.raises(ValueError):
+        int_attrs = intify_attrs({"ENT_IOB": "XX"})
+
+    with pytest.raises(ValueError):
+        int_attrs = intify_attrs({ENT_IOB: "XX"})
+
+
 @pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
 def test_lex_attrs_is_punct(text, match):
     assert is_punct(text) == match
@@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab):
     matcher = Matcher(en_vocab)
     matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
     assert len(matcher(doc)) == 0


+def test_matcher_ent_iob_key(en_vocab):
+    """Test that patterns with ent_iob works correctly."""
+    matcher = Matcher(en_vocab)
+    matcher.add("Rule", [[{"ENT_IOB": "I"}]])
+    doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
+    doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
+    doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
+    doc2.ents = [Span(doc2, 4, 5, label="PERSON")]
+    matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)]
+    matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)]
+    assert len(matches1) == 1
+    assert matches1[0] == "York"
+    assert len(matches2) == 0
+
+    matcher = Matcher(en_vocab)  # Test iob pattern with operators
+    matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
+    doc = Doc(
+        en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
+    )
+    doc.ents = [Span(doc, 4, 7, label="PERSON")]
+    matches = [doc[start:end].text for _, start, end in matcher(doc)]
+    assert len(matches) == 3
+    assert matches[0] == "Maria"
+    assert matches[1] == "Maria Esperanza"
+    assert matches[2] == "Esperanza"
@@ -12,6 +12,7 @@ TEST_PATTERNS = [
     ([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
     ([{"_": "foo"}], 1, 1),
     ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
+    ([{"ENT_IOB": "foo"}], 1, 1),
     ([1, 2, 3], 3, 1),
     # Bad patterns flagged outside of Matcher
     ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0),  # prev: (1, 0)
@@ -79,7 +79,8 @@ def test_explicit_labels():
     nlp.initialize()
     assert spancat.labels == ("PERSON", "LOC")


+#TODO figure out why this is flaky
+@pytest.mark.skip(reason="Test is unreliable for unknown reason")
 def test_doc_gc():
     # If the Doc object is garbage collected, the spans won't be functional afterwards
     nlp = Language()
@@ -97,6 +98,7 @@ def test_doc_gc():
     assert isinstance(spangroups, SpanGroups)
     for key, spangroup in spangroups.items():
         assert isinstance(spangroup, SpanGroup)
+        # XXX This fails with length 0 sometimes
         assert len(spangroup) > 0
         with pytest.raises(RuntimeError):
             span = spangroup[0]
@@ -12,6 +12,8 @@ from spacy.cli._util import is_subpath_of, load_project_config
 from spacy.cli._util import parse_config_overrides, string_to_list
 from spacy.cli._util import substitute_project_variables
 from spacy.cli._util import validate_project_commands
+from spacy.cli.debug_data import _get_labels_from_model
+from spacy.cli.debug_data import _get_labels_from_spancat
 from spacy.cli.download import get_compatibility, get_version
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
 from spacy.cli.package import get_third_party_dependencies
@@ -665,3 +667,28 @@ def test_get_third_party_dependencies():
     )
 def test_is_subpath_of(parent, child, expected):
     assert is_subpath_of(parent, child) == expected
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "factory_name,pipe_name",
+    [
+        ("ner", "ner"),
+        ("ner", "my_ner"),
+        ("spancat", "spancat"),
+        ("spancat", "my_spancat"),
+    ],
+)
+def test_get_labels_from_model(factory_name, pipe_name):
+    labels = ("A", "B")
+
+    nlp = English()
+    pipe = nlp.add_pipe(factory_name, name=pipe_name)
+    for label in labels:
+        pipe.add_label(label)
+    nlp.initialize()
+    assert nlp.get_pipe(pipe_name).labels == labels
+    if factory_name == "spancat":
+        assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
+    else:
+        assert _get_labels_from_model(nlp, factory_name) == set(labels)
@@ -35,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors):
     assert lex1.vector_norm != 0
     assert lex2.vector_norm != 0
     assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1]
+    assert isinstance(lex1.similarity(lex2), float)
     assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2))
     assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
@@ -47,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors):
     assert doc[0].vector_norm != 0
     assert doc[1].vector_norm != 0
     assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1]
+    assert isinstance(doc[0].similarity(doc[1]), float)
     assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2))
     assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))


+def test_vectors_similarity_SS(vocab, vectors):
+    [(word1, vec1), (word2, vec2)] = vectors
+    doc = Doc(vocab, words=[word1, word2])
+    assert isinstance(doc[0:1].similarity(doc[0:2]), float)
+    assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1])
+
+
+def test_vectors_similarity_DD(vocab, vectors):
+    [(word1, vec1), (word2, vec2)] = vectors
+    doc1 = Doc(vocab, words=[word1, word2])
+    doc2 = Doc(vocab, words=[word2, word1])
+    assert isinstance(doc1.similarity(doc2), float)
+    assert doc1.similarity(doc2) == doc2.similarity(doc1)
+
+
 def test_vectors_similarity_TD(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = Doc(vocab, words=[word1, word2])
     with pytest.warns(UserWarning):
+        assert isinstance(doc.similarity(doc[0]), float)
+        assert isinstance(doc[0].similarity(doc), float)
         assert doc.similarity(doc[0]) == doc[0].similarity(doc)


-def test_vectors_similarity_DS(vocab, vectors):
-    [(word1, vec1), (word2, vec2)] = vectors
-    doc = Doc(vocab, words=[word1, word2])
-    assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
-
-
 def test_vectors_similarity_TS(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = Doc(vocab, words=[word1, word2])
     with pytest.warns(UserWarning):
+        assert isinstance(doc[:2].similarity(doc[0]), float)
+        assert isinstance(doc[0].similarity(doc[-2]), float)
         assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
+
+
+def test_vectors_similarity_DS(vocab, vectors):
+    [(word1, vec1), (word2, vec2)] = vectors
+    doc = Doc(vocab, words=[word1, word2])
+    assert isinstance(doc.similarity(doc[:2]), float)
+    assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
@@ -421,7 +421,7 @@ def test_vector_is_oov():
 def test_init_vectors_unset():
     v = Vectors(shape=(10, 10))
     assert v.is_full is False
-    assert v.data.shape == (10, 10)
+    assert v.shape == (10, 10)

     with pytest.raises(ValueError):
         v = Vectors(shape=(10, 10), mode="floret")
@@ -514,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
     # rows: 2 rows per ngram
     rows = OPS.xp.asarray(
         [
-            h % nlp.vocab.vectors.data.shape[0]
+            h % nlp.vocab.vectors.shape[0]
             for ngram in ngrams
             for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
         ],
@@ -544,17 +544,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
         # an empty key returns 0s
         assert_equal(
             OPS.to_numpy(nlp.vocab[""].vector),
-            numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
+            numpy.zeros((nlp.vocab.vectors.shape[0],)),
         )
         # an empty batch returns 0s
         assert_equal(
             OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
-            numpy.zeros((1, nlp.vocab.vectors.data.shape[0])),
+            numpy.zeros((1, nlp.vocab.vectors.shape[0])),
         )
         # an empty key within a batch returns 0s
         assert_equal(
             OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
-            numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
+            numpy.zeros((nlp.vocab.vectors.shape[0],)),
         )

         # the loaded ngram vector table cannot be modified
@@ -616,7 +616,7 @@ cdef class Doc:
         """
        if "has_vector" in self.user_hooks:
            return self.user_hooks["has_vector"](self)
-        elif self.vocab.vectors.data.size:
+        elif self.vocab.vectors.size:
            return True
        elif self.tensor.size:
            return True
@@ -641,7 +641,7 @@ cdef class Doc:
        if not len(self):
            self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
            return self._vector
-        elif self.vocab.vectors.data.size > 0:
+        elif self.vocab.vectors.size > 0:
            self._vector = sum(t.vector for t in self) / len(self)
            return self._vector
        elif self.tensor.size > 0:
@@ -1183,7 +1183,7 @@ cdef class Doc:
        token_offset = -1
        for doc in docs[:-1]:
            token_offset += len(doc)
-            if not (len(doc) > 0 and doc[-1].is_space):
+            if len(doc) > 0 and not doc[-1].is_space:
                concat_spaces[token_offset] = True

        concat_array = numpy.concatenate(arrays)
@@ -352,7 +352,9 @@ cdef class Span:
            return 0.0
        vector = self.vector
        xp = get_array_module(vector)
-        return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+        return result.item()

    cpdef np.ndarray to_array(self, object py_attr_ids):
        """Given a list of M attribute IDs, export the tokens to a numpy
@@ -485,7 +487,7 @@ cdef class Span:
        """
        if "has_vector" in self.doc.user_span_hooks:
            return self.doc.user_span_hooks["has_vector"](self)
-        elif self.vocab.vectors.data.size > 0:
+        elif self.vocab.vectors.size > 0:
            return any(token.has_vector for token in self)
        elif self.doc.tensor.size > 0:
            return True
@@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads

 from .. import parts_of_speech
 from ..errors import Errors, Warnings
+from ..attrs import IOB_STRINGS
 from .underscore import Underscore, get_ext_args
@@ -209,7 +210,9 @@ cdef class Token:
            return 0.0
        vector = self.vector
        xp = get_array_module(vector)
-        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
+        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
+        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
+        return result.item()

    def has_morph(self):
        """Check whether the token has annotated morph information.
@@ -743,7 +746,7 @@ cdef class Token:

    @classmethod
    def iob_strings(cls):
-        return ("", "I", "O", "B")
+        return IOB_STRINGS

    @property
    def ent_iob_(self):
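Both call sites now share one constant; a quick sketch:

```python
from spacy.attrs import IOB_STRINGS
from spacy.tokens import Token

# The index of each string in the tuple is the corresponding integer ENT_IOB value
assert Token.iob_strings() == IOB_STRINGS == ("", "I", "O", "B")
```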
@@ -164,7 +164,7 @@ def load_vectors_into_model(
        len(vectors_nlp.vocab.vectors.keys()) == 0
        and vectors_nlp.vocab.vectors.mode != VectorsMode.floret
    ) or (
-        vectors_nlp.vocab.vectors.data.shape[0] == 0
+        vectors_nlp.vocab.vectors.shape[0] == 0
        and vectors_nlp.vocab.vectors.mode == VectorsMode.floret
    ):
        logger.warning(Warnings.W112.format(name=name))
@@ -10,7 +10,7 @@ from typing import cast
 import warnings
 from enum import Enum
 import srsly
-from thinc.api import get_array_module, get_current_ops
+from thinc.api import Ops, get_array_module, get_current_ops
 from thinc.backends import get_array_ops
 from thinc.types import Floats2d
@@ -146,7 +146,7 @@ cdef class Vectors:

        DOCS: https://spacy.io/api/vectors#size
        """
-        return self.data.shape[0] * self.data.shape[1]
+        return self.data.size

    @property
    def is_full(self):
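A small sketch of the two `Vectors` properties used throughout this commit:

```python
from spacy.vectors import Vectors

v = Vectors(shape=(10, 300))
assert v.shape == (10, 300)
# `size` is simply the number of cells in the underlying data array
assert v.size == 10 * 300
```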
@@ -517,6 +517,9 @@ cdef class Vectors:
                for i in range(len(queries)) ], dtype="uint64")
        return (keys, best_rows, scores)

+    def to_ops(self, ops: Ops):
+        self.data = ops.asarray(self.data)
+
    def _get_cfg(self):
        if self.mode == Mode.default:
            return {
@@ -283,7 +283,7 @@ cdef class Vocab:

    @property
    def vectors_length(self):
-        return self.vectors.data.shape[1]
+        return self.vectors.shape[1]

    def reset_vectors(self, *, width=None, shape=None):
        """Drop the current vector table. Because all vectors must be the same
@@ -294,7 +294,7 @@ cdef class Vocab:
        elif shape is not None:
            self.vectors = Vectors(strings=self.strings, shape=shape)
        else:
-            width = width if width is not None else self.vectors.data.shape[1]
+            width = width if width is not None else self.vectors.shape[1]
            self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))

    def prune_vectors(self, nr_row, batch_size=1024):
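Downstream code can rely on `Vectors.shape` instead of reaching into `vectors.data`; a minimal sketch:

```python
import spacy

nlp = spacy.blank("en")
# vectors_length is now derived from Vectors.shape rather than Vectors.data.shape
assert nlp.vocab.vectors_length == nlp.vocab.vectors.shape[1]
```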
@@ -99,9 +99,9 @@ be a token pattern (list) or a phrase pattern (string). For example:
 ## EntityRuler.initialize {#initialize tag="method" new="3"}

 Initialize the component with data and used before training to load in rules
-from a file. This method is typically called by
-[`Language.initialize`](/api/language#initialize) and lets you customize
-arguments it receives via the
+from a [pattern file](/usage/rule-based-matching/#entityruler-files). This method
+is typically called by [`Language.initialize`](/api/language#initialize) and
+lets you customize arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.
@@ -44,6 +44,7 @@ rule-based matching are:
 | `SPACY` | Token has a trailing space. ~~bool~~ |
 | `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
 | `ENT_TYPE` | The token's entity label. ~~str~~ |
+| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ |
 | `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
 | `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ |
 | `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
@@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch.
 | ------ | --------------------------------------- |
 | `keys` | The keys. ~~Iterable[Union[int, str]]~~ |

+## Vectors.to_ops {#to_ops tag="method"}
+
+Change the embedding matrix to use different Thinc ops.
+
+> #### Example
+>
+> ```python
+> from thinc.api import NumpyOps
+>
+> vectors.to_ops(NumpyOps())
+>
+> ```
+
+| Name | Description |
+|-------|----------------------------------------------------------|
+| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ |
+
 ## Vectors.to_disk {#to_disk tag="method"}

 Save the current state to a directory.
@@ -1479,7 +1479,7 @@ especially useful it you want to pass in a string instead of calling
 ### Example: Pipeline component for GPE entities and country meta data via a REST API {#component-example3}

 This example shows the implementation of a pipeline component that fetches
-country meta data via the [REST Countries API](https://restcountries.eu), sets
+country meta data via the [REST Countries API](https://restcountries.com), sets
 entity annotations for countries and sets custom attributes on the `Doc` and
 `Span` – for example, the capital, latitude/longitude coordinates and even the
 country flag.
@@ -1495,7 +1495,7 @@ from spacy.tokens import Doc, Span, Token
 @Language.factory("rest_countries")
 class RESTCountriesComponent:
     def __init__(self, nlp, name, label="GPE"):
-        r = requests.get("https://restcountries.eu/rest/v2/all")
+        r = requests.get("https://restcountries.com/v2/all")
         r.raise_for_status()  # make sure requests raises an error if it fails
         countries = r.json()
         # Convert API response to dict keyed by country name for easy lookup
@@ -1770,9 +1770,9 @@
        "title": "Applied Language Technology",
        "slogan": "NLP for newcomers using spaCy and Stanza",
        "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. The learning materials assume no previous knowledge of the Python programming language.",
-        "url": "https://applied-language-technology.readthedocs.io/",
+        "url": "https://applied-language-technology.mooc.fi",
        "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg",
-        "thumb": "https://applied-language-technology.readthedocs.io/en/latest/_static/logo.png",
+        "thumb": "https://www.mv.helsinki.fi/home/thiippal/images/applt-logo.png",
        "author": "Tuomo Hiippala",
        "author_links": {
            "twitter": "tuomo_h",
@@ -113,8 +113,7 @@ const QuickstartInstall = ({ id, title }) => {
        {
            id: 'venv',
            title: 'virtual env',
-            help:
-                'Use a virtual environment and install spaCy into a user directory',
+            help: 'Use a virtual environment',
        },
        {
            id: 'train',
@@ -165,27 +164,51 @@ const QuickstartInstall = ({ id, title }) => {
            setters={setters}
            showDropdown={showDropdown}
        >
-            <QS config="venv">python -m venv .env</QS>
-            <QS config="venv" os="mac">
+            <QS package="pip" config="venv">
+                python -m venv .env
+            </QS>
+            <QS package="pip" config="venv" os="mac">
                source .env/bin/activate
            </QS>
-            <QS config="venv" os="linux">
+            <QS package="pip" config="venv" os="linux">
                source .env/bin/activate
            </QS>
-            <QS config="venv" os="windows">
+            <QS package="pip" config="venv" os="windows">
                .env\Scripts\activate
            </QS>
+            <QS package="source" config="venv">
+                python -m venv .env
+            </QS>
+            <QS package="source" config="venv" os="mac">
+                source .env/bin/activate
+            </QS>
+            <QS package="source" config="venv" os="linux">
+                source .env/bin/activate
+            </QS>
+            <QS package="source" config="venv" os="windows">
+                .env\Scripts\activate
+            </QS>
+            <QS package="conda" config="venv">
+                conda create -n venv
+            </QS>
+            <QS package="conda" config="venv">
+                conda activate venv
+            </QS>
            <QS package="pip">pip install -U pip setuptools wheel</QS>
            <QS package="source">pip install -U pip setuptools wheel</QS>
            <QS package="pip">
-                pip install -U {pkg}
-                {pipExtras && `[${pipExtras}]`}
+                {pipExtras
+                    ? `pip install -U '${pkg}[${pipExtras}]'`
+                    : `pip install -U ${pkg}`}
                {nightly ? ' --pre' : ''}
            </QS>
            <QS package="conda">conda install -c conda-forge spacy</QS>
            <QS package="conda" hardware="gpu">
                conda install -c conda-forge cupy
            </QS>
+            <QS package="conda" config="train">
+                conda install -c conda-forge spacy-transformers
+            </QS>
            <QS package="source">
                git clone https://github.com/{repo}
                {nightly ? ` --branch ${DEFAULT_BRANCH}` : ''}
@@ -205,9 +228,6 @@ const QuickstartInstall = ({ id, title }) => {
            <QS config="train" package="conda" comment prompt={false}>
                # packages only available via pip
            </QS>
-            <QS config="train" package="conda">
-                pip install spacy-transformers
-            </QS>
            <QS config="train" package="conda">
                pip install spacy-lookups-data
            </QS>