Merge branch 'develop' into nightly.spacy.io

This commit is contained in:
Ines Montani 2020-09-25 13:21:55 +02:00
commit f3aba49830
24 changed files with 651 additions and 152 deletions

View File

@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy-nightly" __title__ = "spacy-nightly"
__version__ = "3.0.0a24" __version__ = "3.0.0a25"
__release__ = True __release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -51,7 +51,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
update_lockfile(project_dir, cmd) update_lockfile(project_dir, cmd)
# We remove the command from the list here, and break, so that # We remove the command from the list here, and break, so that
# we iterate over the loop again. # we iterate over the loop again.
commands.remove(i) commands.pop(i)
break break
else: else:
# If we didn't break the for loop, break the while loop. # If we didn't break the for loop, break the while loop.

View File

@ -401,10 +401,6 @@ class Errors:
"Matcher or PhraseMatcher with the attribute {attr}. " "Matcher or PhraseMatcher with the attribute {attr}. "
"Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) " "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
"instead of list(nlp.tokenizer.pipe()).") "instead of list(nlp.tokenizer.pipe()).")
E156 = ("The pipeline needs to include a parser in order to use "
"Matcher or PhraseMatcher with the attribute DEP. Try using "
"nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
"list(nlp.tokenizer.pipe()).")
E157 = ("Can't render negative values for dependency arc start or end. " E157 = ("Can't render negative values for dependency arc start or end. "
"Make sure that you're passing in absolute token indices, not " "Make sure that you're passing in absolute token indices, not "
"relative token offsets.\nstart: {start}, end: {end}, label: " "relative token offsets.\nstart: {start}, end: {end}, label: "
@ -517,8 +513,8 @@ class Errors:
"instead.") "instead.")
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed " E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
"property or default function argument?") "property or default function argument?")
E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the " E928 = ("A 'KnowledgeBase' can only be serialized to/from a directory, "
"provided argument {loc} is an existing directory.") "but the provided argument {loc} points to a file.")
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does " E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
"not seem to exist.") "not seem to exist.")
E930 = ("Received invalid get_examples callback in {name}.begin_training. " E930 = ("Received invalid get_examples callback in {name}.begin_training. "

View File

@ -10,6 +10,8 @@ from libcpp.vector cimport vector
from pathlib import Path from pathlib import Path
import warnings import warnings
from spacy.strings import StringStore
from spacy import util from spacy import util
from .typedefs cimport hash_t from .typedefs cimport hash_t
@ -83,6 +85,9 @@ cdef class KnowledgeBase:
DOCS: https://nightly.spacy.io/api/kb DOCS: https://nightly.spacy.io/api/kb
""" """
contents_loc = "contents"
strings_loc = "strings.json"
def __init__(self, Vocab vocab, entity_vector_length): def __init__(self, Vocab vocab, entity_vector_length):
"""Create a KnowledgeBase.""" """Create a KnowledgeBase."""
self.mem = Pool() self.mem = Pool()
@ -319,15 +324,29 @@ cdef class KnowledgeBase:
return 0.0 return 0.0
def to_disk(self, path): def to_disk(self, path):
path = util.ensure_path(path) path = util.ensure_path(path)
if path.is_dir(): if not path.exists():
path.mkdir(parents=True)
if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path)) raise ValueError(Errors.E928.format(loc=path))
if not path.parent.exists(): self.write_contents(path / self.contents_loc)
path.parent.mkdir(parents=True) self.vocab.strings.to_disk(path / self.strings_loc)
cdef Writer writer = Writer(path) def from_disk(self, path):
path = util.ensure_path(path)
if not path.exists():
raise ValueError(Errors.E929.format(loc=path))
if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
self.read_contents(path / self.contents_loc)
kb_strings = StringStore()
kb_strings.from_disk(path / self.strings_loc)
for string in kb_strings:
self.vocab.strings.add(string)
def write_contents(self, file_path):
cdef Writer writer = Writer(file_path)
writer.write_header(self.get_size_entities(), self.entity_vector_length) writer.write_header(self.get_size_entities(), self.entity_vector_length)
# dumping the entity vectors in their original order # dumping the entity vectors in their original order
@ -366,13 +385,7 @@ cdef class KnowledgeBase:
writer.close() writer.close()
def from_disk(self, path): def read_contents(self, file_path):
path = util.ensure_path(path)
if path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
if not path.exists():
raise ValueError(Errors.E929.format(loc=path))
cdef hash_t entity_hash cdef hash_t entity_hash
cdef hash_t alias_hash cdef hash_t alias_hash
cdef int64_t entry_index cdef int64_t entry_index
@ -382,7 +395,7 @@ cdef class KnowledgeBase:
cdef AliasC alias cdef AliasC alias
cdef float vector_element cdef float vector_element
cdef Reader reader = Reader(path) cdef Reader reader = Reader(file_path)
# STEP 0: load header and initialize KB # STEP 0: load header and initialize KB
cdef int64_t nr_entities cdef int64_t nr_entities

View File

@ -17,6 +17,7 @@ from ..vocab cimport Vocab
from ..tokens.doc cimport Doc, get_token_attr_for_matcher from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.span cimport Span from ..tokens.span cimport Span
from ..tokens.token cimport Token from ..tokens.token cimport Token
from ..tokens.morphanalysis cimport MorphAnalysis
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
from ..schemas import validate_token_pattern from ..schemas import validate_token_pattern
@ -124,7 +125,7 @@ cdef class Matcher:
key = self._normalize_key(key) key = self._normalize_key(key)
for pattern in patterns: for pattern in patterns:
try: try:
specs = _preprocess_pattern(pattern, self.vocab.strings, specs = _preprocess_pattern(pattern, self.vocab,
self._extensions, self._extra_predicates) self._extensions, self._extra_predicates)
self.patterns.push_back(init_pattern(self.mem, key, specs)) self.patterns.push_back(init_pattern(self.mem, key, specs))
for spec in specs: for spec in specs:
@ -195,7 +196,7 @@ cdef class Matcher:
else: else:
yield doc yield doc
def __call__(self, object doclike, *, as_spans=False): def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
"""Find all token sequences matching the supplied pattern. """Find all token sequences matching the supplied pattern.
doclike (Doc or Span): The document to match over. doclike (Doc or Span): The document to match over.
@ -215,16 +216,19 @@ cdef class Matcher:
else: else:
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
cdef Pool tmp_pool = Pool() cdef Pool tmp_pool = Pool()
if TAG in self._seen_attrs and not doc.has_annotation("TAG"): if not allow_missing:
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG")) for attr in (TAG, POS, MORPH, LEMMA, DEP):
if POS in self._seen_attrs and not doc.has_annotation("POS"): if attr in self._seen_attrs and not doc.has_annotation(attr):
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS")) if attr == TAG:
if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"): pipe = "tagger"
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH")) elif attr in (POS, MORPH):
if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"): pipe = "morphologizer"
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA")) elif attr == LEMMA:
if DEP in self._seen_attrs and not doc.has_annotation("DEP"): pipe = "lemmatizer"
raise ValueError(Errors.E156.format()) elif attr == DEP:
pipe = "parser"
error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
raise ValueError(error_msg)
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
extensions=self._extensions, predicates=self._extra_predicates) extensions=self._extensions, predicates=self._extra_predicates)
final_matches = [] final_matches = []
@ -660,7 +664,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
return id_attr.value return id_attr.value
def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates): def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
"""This function interprets the pattern, converting the various bits of """This function interprets the pattern, converting the various bits of
syntactic sugar before we compile it into a struct with init_pattern. syntactic sugar before we compile it into a struct with init_pattern.
@ -675,6 +679,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
extra_predicates. extra_predicates.
""" """
tokens = [] tokens = []
string_store = vocab.strings
for spec in token_specs: for spec in token_specs:
if not spec: if not spec:
# Signifier for 'any token' # Signifier for 'any token'
@ -685,7 +690,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
ops = _get_operators(spec) ops = _get_operators(spec)
attr_values = _get_attr_values(spec, string_store) attr_values = _get_attr_values(spec, string_store)
extensions = _get_extensions(spec, string_store, extensions_table) extensions = _get_extensions(spec, string_store, extensions_table)
predicates = _get_extra_predicates(spec, extra_predicates) predicates = _get_extra_predicates(spec, extra_predicates, vocab)
for op in ops: for op in ops:
tokens.append((op, list(attr_values), list(extensions), list(predicates))) tokens.append((op, list(attr_values), list(extensions), list(predicates)))
return tokens return tokens
@ -729,7 +734,7 @@ def _get_attr_values(spec, string_store):
class _RegexPredicate: class _RegexPredicate:
operators = ("REGEX",) operators = ("REGEX",)
def __init__(self, i, attr, value, predicate, is_extension=False): def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
self.i = i self.i = i
self.attr = attr self.attr = attr
self.value = re.compile(value) self.value = re.compile(value)
@ -747,13 +752,18 @@ class _RegexPredicate:
return bool(self.value.search(value)) return bool(self.value.search(value))
class _SetMemberPredicate: class _SetPredicate:
operators = ("IN", "NOT_IN") operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")
def __init__(self, i, attr, value, predicate, is_extension=False): def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
self.i = i self.i = i
self.attr = attr self.attr = attr
self.value = set(get_string_id(v) for v in value) self.vocab = vocab
if self.attr == MORPH:
# normalize morph strings
self.value = set(self.vocab.morphology.add(v) for v in value)
else:
self.value = set(get_string_id(v) for v in value)
self.predicate = predicate self.predicate = predicate
self.is_extension = is_extension self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
@ -765,19 +775,32 @@ class _SetMemberPredicate:
value = get_string_id(token._.get(self.attr)) value = get_string_id(token._.get(self.attr))
else: else:
value = get_token_attr_for_matcher(token.c, self.attr) value = get_token_attr_for_matcher(token.c, self.attr)
if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
if self.attr == MORPH:
# break up MORPH into individual Feat=Val values
value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
else:
# IS_SUBSET for other attrs will be equivalent to "IN"
# IS_SUPERSET will only match for other attrs with 0 or 1 values
value = set([value])
if self.predicate == "IN": if self.predicate == "IN":
return value in self.value return value in self.value
else: elif self.predicate == "NOT_IN":
return value not in self.value return value not in self.value
elif self.predicate == "IS_SUBSET":
return value <= self.value
elif self.predicate == "IS_SUPERSET":
return value >= self.value
def __repr__(self): def __repr__(self):
return repr(("SetMemberPredicate", self.i, self.attr, self.value, self.predicate)) return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))
class _ComparisonPredicate: class _ComparisonPredicate:
operators = ("==", "!=", ">=", "<=", ">", "<") operators = ("==", "!=", ">=", "<=", ">", "<")
def __init__(self, i, attr, value, predicate, is_extension=False): def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
self.i = i self.i = i
self.attr = attr self.attr = attr
self.value = value self.value = value
@ -806,11 +829,13 @@ class _ComparisonPredicate:
return value < self.value return value < self.value
def _get_extra_predicates(spec, extra_predicates): def _get_extra_predicates(spec, extra_predicates, vocab):
predicate_types = { predicate_types = {
"REGEX": _RegexPredicate, "REGEX": _RegexPredicate,
"IN": _SetMemberPredicate, "IN": _SetPredicate,
"NOT_IN": _SetMemberPredicate, "NOT_IN": _SetPredicate,
"IS_SUBSET": _SetPredicate,
"IS_SUPERSET": _SetPredicate,
"==": _ComparisonPredicate, "==": _ComparisonPredicate,
"!=": _ComparisonPredicate, "!=": _ComparisonPredicate,
">=": _ComparisonPredicate, ">=": _ComparisonPredicate,
@ -838,7 +863,7 @@ def _get_extra_predicates(spec, extra_predicates):
value_with_upper_keys = {k.upper(): v for k, v in value.items()} value_with_upper_keys = {k.upper(): v for k, v in value.items()}
for type_, cls in predicate_types.items(): for type_, cls in predicate_types.items():
if type_ in value_with_upper_keys: if type_ in value_with_upper_keys:
predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_) predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
# Don't create redundant predicates. # Don't create redundant predicates.
# This helps with efficiency, as we're caching the results. # This helps with efficiency, as we're caching the results.
if predicate.key in seen_predicates: if predicate.key in seen_predicates:

View File

@ -186,16 +186,18 @@ cdef class PhraseMatcher:
if isinstance(doc, Doc): if isinstance(doc, Doc):
attrs = (TAG, POS, MORPH, LEMMA, DEP) attrs = (TAG, POS, MORPH, LEMMA, DEP)
has_annotation = {attr: doc.has_annotation(attr) for attr in attrs} has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
if self.attr == TAG and not has_annotation[TAG]: for attr in attrs:
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG")) if self.attr == attr and not has_annotation[attr]:
if self.attr == POS and not has_annotation[POS]: if attr == TAG:
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS")) pipe = "tagger"
if self.attr == MORPH and not has_annotation[MORPH]: elif attr in (POS, MORPH):
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH")) pipe = "morphologizer"
if self.attr == LEMMA and not has_annotation[LEMMA]: elif attr == LEMMA:
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA")) pipe = "lemmatizer"
if self.attr == DEP and not has_annotation[DEP]: elif attr == DEP:
raise ValueError(Errors.E156.format()) pipe = "parser"
error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
raise ValueError(error_msg)
if self._validate and any(has_annotation.values()) \ if self._validate and any(has_annotation.values()) \
and self.attr not in attrs: and self.attr not in attrs:
string_attr = self.vocab.strings[self.attr] string_attr = self.vocab.strings[self.attr]

View File

@ -79,7 +79,7 @@ class AttributeRuler(Pipe):
DOCS: https://nightly.spacy.io/api/attributeruler#call DOCS: https://nightly.spacy.io/api/attributeruler#call
""" """
matches = sorted(self.matcher(doc)) matches = sorted(self.matcher(doc, allow_missing=True))
for match_id, start, end in matches: for match_id, start, end in matches:
span = Span(doc, start, end, label=match_id) span = Span(doc, start, end, label=match_id)
@ -126,8 +126,12 @@ class AttributeRuler(Pipe):
for tag, attrs in tag_map.items(): for tag, attrs in tag_map.items():
pattern = [{"TAG": tag}] pattern = [{"TAG": tag}]
attrs, morph_attrs = _split_morph_attrs(attrs) attrs, morph_attrs = _split_morph_attrs(attrs)
morph = self.vocab.morphology.add(morph_attrs) if "MORPH" not in attrs:
attrs["MORPH"] = self.vocab.strings[morph] morph = self.vocab.morphology.add(morph_attrs)
attrs["MORPH"] = self.vocab.strings[morph]
else:
morph = self.vocab.morphology.add(attrs["MORPH"])
attrs["MORPH"] = self.vocab.strings[morph]
self.add([pattern], attrs) self.add([pattern], attrs)
def load_from_morph_rules( def load_from_morph_rules(
@ -146,8 +150,12 @@ class AttributeRuler(Pipe):
pattern = [{"ORTH": word, "TAG": tag}] pattern = [{"ORTH": word, "TAG": tag}]
attrs = morph_rules[tag][word] attrs = morph_rules[tag][word]
attrs, morph_attrs = _split_morph_attrs(attrs) attrs, morph_attrs = _split_morph_attrs(attrs)
morph = self.vocab.morphology.add(morph_attrs) if "MORPH" in attrs:
attrs["MORPH"] = self.vocab.strings[morph] morph = self.vocab.morphology.add(attrs["MORPH"])
attrs["MORPH"] = self.vocab.strings[morph]
elif morph_attrs:
morph = self.vocab.morphology.add(morph_attrs)
attrs["MORPH"] = self.vocab.strings[morph]
self.add([pattern], attrs) self.add([pattern], attrs)
def add( def add(

View File

@ -16,6 +16,7 @@ from ..training import Example, validate_examples
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..util import SimpleFrozenList from ..util import SimpleFrozenList
from .. import util from .. import util
from ..scorer import Scorer
default_model_config = """ default_model_config = """
@ -47,6 +48,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"incl_context": True, "incl_context": True,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
}, },
default_score_weights={
"nel_micro_f": 1.0,
"nel_micro_r": None,
"nel_micro_p": None,
},
) )
def make_entity_linker( def make_entity_linker(
nlp: Language, nlp: Language,
@ -209,12 +215,11 @@ class EntityLinker(Pipe):
# it does run the model twice :( # it does run the model twice :(
predictions = self.model.predict(docs) predictions = self.model.predict(docs)
for eg in examples: for eg in examples:
sentences = [s for s in eg.predicted.sents] sentences = [s for s in eg.reference.sents]
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
for ent in eg.predicted.ents: for ent in eg.reference.ents:
kb_id = kb_ids[ # KB ID of the first token is the same as the whole span
ent.start kb_id = kb_ids[ent.start]
] # KB ID of the first token is the same as the whole span
if kb_id: if kb_id:
try: try:
# find the sentence in the list of sentences. # find the sentence in the list of sentences.
@ -253,7 +258,7 @@ class EntityLinker(Pipe):
entity_encodings = [] entity_encodings = []
for eg in examples: for eg in examples:
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
for ent in eg.predicted.ents: for ent in eg.reference.ents:
kb_id = kb_ids[ent.start] kb_id = kb_ids[ent.start]
if kb_id: if kb_id:
entity_encoding = self.kb.get_vector(kb_id) entity_encoding = self.kb.get_vector(kb_id)
@ -415,6 +420,17 @@ class EntityLinker(Pipe):
for token in ent: for token in ent:
token.ent_kb_id_ = kb_id token.ent_kb_id_ = kb_id
def score(self, examples, **kwargs):
    """Compute entity-linking scores for a batch of examples.

    examples (Iterable[Example]): The examples to score.
    **kwargs: Accepted but not used by this method.
    RETURNS (Dict[str, Any]): The scores, as returned by Scorer.score_links.

    DOCS TODO: https://nightly.spacy.io/api/entity_linker#score
    """
    validate_examples(examples, "EntityLinker.score")
    # The component's NIL value is passed as the only negative label.
    non_link_labels = [self.NIL]
    link_scores = Scorer.score_links(examples, negative_labels=non_link_labels)
    return link_scores
def to_disk( def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None: ) -> None:

View File

@ -6,7 +6,7 @@ from .transition_parser cimport Parser
from ._parser_internals.ner cimport BiluoPushDown from ._parser_internals.ner cimport BiluoPushDown
from ..language import Language from ..language import Language
from ..scorer import Scorer from ..scorer import get_ner_prf, PRFScore
from ..training import validate_examples from ..training import validate_examples
@ -117,9 +117,18 @@ cdef class EntityRecognizer(Parser):
"""Score a batch of examples. """Score a batch of examples.
examples (Iterable[Example]): The examples to score. examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
DOCS: https://nightly.spacy.io/api/entityrecognizer#score DOCS: https://nightly.spacy.io/api/entityrecognizer#score
""" """
validate_examples(examples, "EntityRecognizer.score") validate_examples(examples, "EntityRecognizer.score")
return Scorer.score_spans(examples, "ents", **kwargs) score_per_type = get_ner_prf(examples)
totals = PRFScore()
for prf in score_per_type.values():
totals += prf
return {
"ents_p": totals.precision,
"ents_r": totals.recall,
"ents_f": totals.fscore,
"ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
}

View File

@ -61,6 +61,8 @@ class TokenPatternString(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex") REGEX: Optional[StrictStr] = Field(None, alias="regex")
IN: Optional[List[StrictStr]] = Field(None, alias="in") IN: Optional[List[StrictStr]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
class Config: class Config:
extra = "forbid" extra = "forbid"
@ -77,6 +79,8 @@ class TokenPatternNumber(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex") REGEX: Optional[StrictStr] = Field(None, alias="regex")
IN: Optional[List[StrictInt]] = Field(None, alias="in") IN: Optional[List[StrictInt]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in") NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
# These field names must match the operator keys the Matcher dispatches on
# ("IS_SUBSET" / "IS_SUPERSET") and the names used by TokenPatternString;
# otherwise numeric patterns that use those operators fail validation,
# because this model forbids extra fields.
IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==") EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=") NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=") GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
@ -115,6 +119,7 @@ class TokenPattern(BaseModel):
lower: Optional[StringValue] = None lower: Optional[StringValue] = None
pos: Optional[StringValue] = None pos: Optional[StringValue] = None
tag: Optional[StringValue] = None tag: Optional[StringValue] = None
morph: Optional[StringValue] = None
dep: Optional[StringValue] = None dep: Optional[StringValue] = None
lemma: Optional[StringValue] = None lemma: Optional[StringValue] = None
shape: Optional[StringValue] = None shape: Optional[StringValue] = None

View File

@ -1,5 +1,6 @@
from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
import numpy as np import numpy as np
from collections import defaultdict
from .training import Example from .training import Example
from .tokens import Token, Doc, Span from .tokens import Token, Doc, Span
@ -23,6 +24,19 @@ class PRFScore:
self.fp = 0 self.fp = 0
self.fn = 0 self.fn = 0
def __iadd__(self, other):
    """Add the tp, fp and fn counts of `other` onto this object in place.

    other: Object exposing numeric `tp`, `fp` and `fn` attributes.
    RETURNS: This same instance, so that `a += b` keeps `a` bound to it.
    """
    for counter in ("tp", "fp", "fn"):
        setattr(self, counter, getattr(self, counter) + getattr(other, counter))
    return self
def __add__(self, other):
    """Return a new PRFScore holding the summed counts of both operands.

    Neither operand is modified.

    other: Object exposing numeric `tp`, `fp` and `fn` attributes.
    RETURNS (PRFScore): A fresh score object with the combined counts.
    """
    # Build the result with the no-argument constructor, as the other call
    # sites visible in this module do, and fill in the counters afterwards
    # instead of passing tp/fp/fn as keyword arguments.
    # NOTE(review): the visible part of __init__ only assigns literal zeros,
    # so it presumably does not accept tp/fp/fn keywords - confirm.
    total = PRFScore()
    total.tp = self.tp + other.tp
    total.fp = self.fp + other.fp
    total.fn = self.fn + other.fn
    return total
def score_set(self, cand: set, gold: set) -> None: def score_set(self, cand: set, gold: set) -> None:
self.tp += len(cand.intersection(gold)) self.tp += len(cand.intersection(gold))
self.fp += len(cand - gold) self.fp += len(cand - gold)
@ -295,12 +309,6 @@ class Scorer:
# Find all predidate labels, for all and per type # Find all predidate labels, for all and per type
gold_spans = set() gold_spans = set()
pred_spans = set() pred_spans = set()
# Special case for ents:
# If we have missing values in the gold, we can't easily tell
# whether our NER predictions are true.
# It seems bad but it's what we've always done.
if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
continue
for span in getter(gold_doc, attr): for span in getter(gold_doc, attr):
gold_span = (span.label_, span.start, span.end - 1) gold_span = (span.label_, span.start, span.end - 1)
gold_spans.add(gold_span) gold_spans.add(gold_span)
@ -451,6 +459,74 @@ class Scorer:
results[f"{attr}_score_desc"] = "macro AUC" results[f"{attr}_score_desc"] = "macro AUC"
return results return results
@staticmethod
def score_links(
    examples: Iterable[Example], *, negative_labels: Iterable[str]
) -> Dict[str, Any]:
    """Returns PRF for predicted links on the entity level.
    To disentangle the performance of the NEL from the NER,
    this method only evaluates NEL links for entities that overlap
    between the gold reference and the predictions, i.e. predicted
    entities whose (start_char, end_char) offsets are identical to those
    of a gold entity. Predicted entities without such a gold counterpart
    are skipped.

    examples (Iterable[Example]): Examples to score
    negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
    RETURNS (Dict[str, Any]): A dictionary containing the scores: micro and
        macro precision/recall/F ("nel_micro_*", "nel_macro_*"), the micro F
        again as "nel_score", and per-entity-label scores in
        "nel_f_per_type".

    DOCS (TODO): https://nightly.spacy.io/api/scorer#score_links
    """
    f_per_type = {}
    for example in examples:
        # Index the gold entities by character offsets for exact lookup.
        gold_ent_by_offset = {}
        for gold_ent in example.reference.ents:
            gold_ent_by_offset[(gold_ent.start_char, gold_ent.end_char)] = gold_ent

        for pred_ent in example.predicted.ents:
            gold_span = gold_ent_by_offset.get(
                (pred_ent.start_char, pred_ent.end_char), None
            )
            # only evaluating entities that overlap between gold and pred,
            # to disentangle the performance of the NEL from the NER
            if gold_span is None:
                # No gold entity with these boundaries: this is an NER
                # mismatch, not a linking decision, so it is not scored.
                continue
            label = gold_span.label_
            if label not in f_per_type:
                f_per_type[label] = PRFScore()
            gold = gold_span.kb_id_
            if gold is None:
                continue
            pred = pred_ent.kb_id_
            if gold in negative_labels and pred in negative_labels:
                # ignore true negatives
                pass
            elif gold == pred:
                f_per_type[label].tp += 1
            elif gold in negative_labels:
                f_per_type[label].fp += 1
            elif pred in negative_labels:
                f_per_type[label].fn += 1
            else:
                # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
                f_per_type[label].fp += 1
                f_per_type[label].fn += 1

    # Micro average: pool the raw counts over all labels.
    micro_prf = PRFScore()
    for label_prf in f_per_type.values():
        micro_prf.tp += label_prf.tp
        micro_prf.fn += label_prf.fn
        micro_prf.fp += label_prf.fp

    # Macro average: mean of the per-label scores. The tiny epsilon avoids a
    # division by zero when no label was scored at all.
    n_labels = len(f_per_type) + 1e-100
    macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels
    macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels
    macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels

    results = {
        "nel_score": micro_prf.fscore,
        "nel_score_desc": "micro F",
        "nel_micro_p": micro_prf.precision,
        "nel_micro_r": micro_prf.recall,
        "nel_micro_f": micro_prf.fscore,
        "nel_macro_p": macro_p,
        "nel_macro_r": macro_r,
        "nel_macro_f": macro_f,
        "nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
    }
    return results
@staticmethod @staticmethod
def score_deps( def score_deps(
examples: Iterable[Example], examples: Iterable[Example],
@ -545,6 +621,39 @@ class Scorer:
} }
def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
    """Compute per-entity PRFScore objects for a sequence of examples. The
    results are returned as a dictionary keyed by the entity type. You can
    add the PRFScore objects to get micro-averaged total.

    Examples whose reference doc (`eg.y`) has no "ENT_IOB" annotation are
    skipped entirely. A predicted entity that maps onto reference tokens
    with missing entity annotation (`ent_iob == 0`) is neither counted as
    right nor wrong.

    examples (Iterable[Example]): The examples to score.
    RETURNS (Dict[str, PRFScore]): The scores, keyed by entity label.
    """
    scores = defaultdict(PRFScore)
    for eg in examples:
        if not eg.y.has_annotation("ENT_IOB"):
            continue
        # Gold entities not yet matched by a prediction; whatever is left at
        # the end of this example counts as a false negative.
        golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
        align_x2y = eg.alignment.x2y
        for pred_ent in eg.x.ents:
            # The lookup registers the label in the defaultdict, so every
            # predicted label shows up in the result even if the prediction
            # itself ends up being ignored below.
            label_score = scores[pred_ent.label_]
            indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
            if not len(indices):
                continue
            span_start, span_end = indices[0], indices[-1] + 1
            g_span = eg.y[span_start:span_end]
            # Check we aren't missing annotation on this span. If so,
            # our prediction is neither right nor wrong, we just
            # ignore it.
            if any(token.ent_iob == 0 for token in g_span):
                continue
            key = (pred_ent.label_, span_start, span_end)
            if key in golds:
                label_score.tp += 1
                golds.remove(key)
            else:
                label_score.fp += 1
        for label, _start, _end in golds:
            scores[label].fn += 1
    return scores
############################################################################# #############################################################################
# #
# The following implementation of roc_auc_score() is adapted from # The following implementation of roc_auc_score() is adapted from

View File

@ -230,6 +230,106 @@ def test_matcher_set_value_operator(en_vocab):
assert len(matches) == 1 assert len(matches) == 1
def test_matcher_subset_value_operator(en_vocab):
    """Check the IS_SUBSET operator on MORPH and on a single-valued attr."""
    morph_matcher = Matcher(en_vocab)
    morph_matcher.add("M", [[{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}]])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    # Before any morph is set, all three tokens match.
    assert len(morph_matcher(doc)) == 3
    # (morph assigned to the first token, expected number of matches)
    morph_cases = [
        ("Feat=Val", 3),
        ("Feat=Val|Feat2=Val2", 3),
        ("Feat=Val|Feat2=Val2|Feat3=Val3", 2),
        ("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4", 2),
    ]
    for morph, n_expected in morph_cases:
        doc[0].morph_ = morph
        assert len(morph_matcher(doc)) == n_expected

    # (TAG pattern values, expected number of matches when doc[0] is "A")
    tag_cases = [
        # IS_SUBSET acts like "IN" for attrs other than MORPH
        (["A", "B"], 1),
        # IS_SUBSET with an empty list matches nothing
        ([], 0),
    ]
    for tag_values, n_expected in tag_cases:
        tag_matcher = Matcher(en_vocab)
        tag_pattern = [{"TAG": {"IS_SUBSET": tag_values}}]
        tag_matcher.add("M", [tag_pattern])
        doc = Doc(en_vocab, words=["a", "b", "c"])
        doc[0].tag_ = "A"
        assert len(tag_matcher(doc)) == n_expected
def test_matcher_superset_value_operator(en_vocab):
    """Check the IS_SUPERSET operator on MORPH and on a single-valued attr."""
    morph_matcher = Matcher(en_vocab)
    morph_matcher.add(
        "M", [[{"MORPH": {"IS_SUPERSET": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]]
    )
    doc = Doc(en_vocab, words=["a", "b", "c"])
    # Before any morph is set, nothing matches.
    assert len(morph_matcher(doc)) == 0
    # (morph assigned to the first token, expected number of matches)
    morph_cases = [
        ("Feat=Val|Feat2=Val2", 0),
        ("Feat=Val|Feat2=Val2|Feat3=Val3", 1),
        ("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4", 1),
    ]
    for morph, n_expected in morph_cases:
        doc[0].morph_ = morph
        assert len(morph_matcher(doc)) == n_expected

    # (TAG pattern values, expected number of matches when doc[0] is "A")
    tag_cases = [
        # IS_SUPERSET with more than one value only matches for MORPH
        (["A", "B"], 0),
        # IS_SUPERSET with one value is the same as ==
        (["A"], 1),
        # IS_SUPERSET with an empty value matches everything
        ([], 3),
    ]
    for tag_values, n_expected in tag_cases:
        tag_matcher = Matcher(en_vocab)
        tag_pattern = [{"TAG": {"IS_SUPERSET": tag_values}}]
        tag_matcher.add("M", [tag_pattern])
        doc = Doc(en_vocab, words=["a", "b", "c"])
        doc[0].tag_ = "A"
        assert len(tag_matcher(doc)) == n_expected
def test_matcher_morph_handling(en_vocab):
    """Check how MORPH values are compared against Matcher patterns."""
    # order of features in pattern doesn't matter
    matcher = Matcher(en_vocab)
    in_patterns = (
        ("M", "Feat1=Val1|Feat2=Val2"),
        ("N", "Feat2=Val2|Feat1=Val1"),
    )
    for key, feats in in_patterns:
        matcher.add(key, [[{"MORPH": {"IN": [feats]}}]])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc)) == 0
    for morph in ("Feat2=Val2|Feat1=Val1", "Feat1=Val1|Feat2=Val2"):
        doc[0].morph_ = morph
        assert len(matcher(doc)) == 2

    # multiple values are split
    matcher = Matcher(en_vocab)
    superset_patterns = (
        ("M", ["Feat1=Val1", "Feat2=Val2"]),
        ("N", ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"]),
    )
    for key, values in superset_patterns:
        matcher.add(key, [[{"MORPH": {"IS_SUPERSET": values}}]])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc)) == 0
    split_cases = (
        ("Feat2=Val2,Val3|Feat1=Val1", 1),
        ("Feat1=Val1,Val3|Feat2=Val2", 2),
    )
    for morph, n_matches in split_cases:
        doc[0].morph_ = morph
        assert len(matcher(doc)) == n_matches
def test_matcher_regex(en_vocab): def test_matcher_regex(en_vocab):
matcher = Matcher(en_vocab) matcher = Matcher(en_vocab)
pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}] pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
@ -316,6 +416,9 @@ def test_attr_pipeline_checks(en_vocab):
matcher(doc2) matcher(doc2)
with pytest.raises(ValueError): with pytest.raises(ValueError):
matcher(doc3) matcher(doc3)
# errors can be suppressed if desired
matcher(doc2, allow_missing=True)
matcher(doc3, allow_missing=True)
# TAG, POS, LEMMA require those values # TAG, POS, LEMMA require those values
for attr in ("TAG", "POS", "LEMMA"): for attr in ("TAG", "POS", "LEMMA"):
matcher = Matcher(en_vocab) matcher = Matcher(en_vocab)

View File

@ -2,8 +2,10 @@ from typing import Callable, Iterable
import pytest import pytest
from spacy.kb import KnowledgeBase, get_candidates, Candidate from spacy.kb import KnowledgeBase, get_candidates, Candidate
from spacy.vocab import Vocab
from spacy import util, registry from spacy import util, registry
from spacy.scorer import Scorer
from spacy.training import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.tests.util import make_tempdir from spacy.tests.util import make_tempdir
@ -151,22 +153,15 @@ def test_kb_serialize(nlp):
# normal read-write behaviour # normal read-write behaviour
mykb.to_disk(d / "kb") mykb.to_disk(d / "kb")
mykb.from_disk(d / "kb") mykb.from_disk(d / "kb")
mykb.to_disk(d / "kb.file")
mykb.from_disk(d / "kb.file")
mykb.to_disk(d / "new" / "kb") mykb.to_disk(d / "new" / "kb")
mykb.from_disk(d / "new" / "kb") mykb.from_disk(d / "new" / "kb")
# allow overwriting an existing file # allow overwriting an existing file
mykb.to_disk(d / "kb.file") mykb.to_disk(d / "kb")
with pytest.raises(ValueError):
# can not write to a directory
mykb.to_disk(d)
with pytest.raises(ValueError):
# can not read from a directory
mykb.from_disk(d)
with pytest.raises(ValueError): with pytest.raises(ValueError):
# can not read from an unknown file # can not read from an unknown file
mykb.from_disk(d / "unknown" / "kb") mykb.from_disk(d / "unknown" / "kb")
def test_candidate_generation(nlp): def test_candidate_generation(nlp):
"""Test correct candidate generation""" """Test correct candidate generation"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@ -254,6 +249,41 @@ def test_el_pipe_configuration(nlp):
assert doc[2].ent_kb_id_ == "Q2" assert doc[2].ent_kb_id_ == "Q2"
def test_vocab_serialization(nlp):
    """Test that string information is retained across storage"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities (only the hash of Q2 is checked below)
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # adding aliases (only the hash of "adam" is checked below)
    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
    adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    def check_adam_candidates(kb):
        # "adam" must resolve to exactly one candidate, Q2, with both the
        # hash values and the string forms intact.
        candidates = kb.get_alias_candidates("adam")
        assert len(candidates) == 1
        assert candidates[0].entity == q2_hash
        assert candidates[0].entity_ == "Q2"
        assert candidates[0].alias == adam_hash
        assert candidates[0].alias_ == "adam"

    check_adam_candidates(mykb)

    with make_tempdir() as d:
        mykb.to_disk(d / "kb")
        # Load into a KB backed by a brand-new Vocab and run the same checks.
        kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
        kb_new_vocab.from_disk(d / "kb")
        check_adam_candidates(kb_new_vocab)
def test_append_alias(nlp): def test_append_alias(nlp):
"""Test that we can append additional alias-entity pairs""" """Test that we can append additional alias-entity pairs"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@ -377,16 +407,20 @@ def test_preserving_links_ents_2(nlp):
TRAIN_DATA = [ TRAIN_DATA = [
("Russ Cochran captured his first major title with his son as caddie.", ("Russ Cochran captured his first major title with his son as caddie.",
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}, {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
"entities": [(0, 12, "PERSON")]}), "entities": [(0, 12, "PERSON")],
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
("Russ Cochran his reprints include EC Comics.", ("Russ Cochran his reprints include EC Comics.",
{"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
"entities": [(0, 12, "PERSON")]}), "entities": [(0, 12, "PERSON")],
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
("Russ Cochran has been publishing comic art.", ("Russ Cochran has been publishing comic art.",
{"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
"entities": [(0, 12, "PERSON")]}), "entities": [(0, 12, "PERSON")],
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
("Russ Cochran was a member of University of Kentucky's golf team.", ("Russ Cochran was a member of University of Kentucky's golf team.",
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}, {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
"entities": [(0, 12, "PERSON"), (43, 51, "LOC")]}), "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
] ]
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"] GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
# fmt: on # fmt: on
@ -395,16 +429,8 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
def test_overfitting_IO(): def test_overfitting_IO():
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
nlp = English() nlp = English()
nlp.add_pipe("sentencizer")
vector_length = 3 vector_length = 3
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data
patterns = [
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
]
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
# Convert the texts to docs to make sure we have doc.ents set for the training examples # Convert the texts to docs to make sure we have doc.ents set for the training examples
train_examples = [] train_examples = []
for text, annotation in TRAIN_DATA: for text, annotation in TRAIN_DATA:
@ -446,6 +472,16 @@ def test_overfitting_IO():
nlp.update(train_examples, sgd=optimizer, losses=losses) nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["entity_linker"] < 0.001 assert losses["entity_linker"] < 0.001
# adding additional components that are required for the entity_linker
nlp.add_pipe("sentencizer", first=True)
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data
patterns = [
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
]
ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
ruler.add_patterns(patterns)
# test the trained model # test the trained model
predictions = [] predictions = []
for text, annotation in TRAIN_DATA: for text, annotation in TRAIN_DATA:
@ -465,3 +501,46 @@ def test_overfitting_IO():
for ent in doc2.ents: for ent in doc2.ents:
predictions.append(ent.kb_id_) predictions.append(ent.kb_id_)
assert predictions == GOLD_entities assert predictions == GOLD_entities
def test_scorer_links():
    """Check the per-type and micro precision/recall from Scorer.score_links."""
    nlp = English()

    def annotated(text, ents):
        # Tokenize `text` and attach one entity span per
        # (start, end, label, kb_id) tuple.
        doc = nlp(text)
        doc.ents = [
            Span(doc, start, end, label=label, kb_id=kb_id)
            for start, end, label, kb_id in ents
        ]
        return doc

    # (text, reference entities, predicted entities)
    cases = [
        (
            "Julia lives in London happily.",
            [(0, 1, "PERSON", "Q2"), (3, 4, "LOC", "Q3")],
            [(0, 1, "PERSON", "Q70"), (3, 4, "LOC", "Q3")],
        ),
        (
            "She loves London.",
            [(0, 1, "PERSON", "Q2"), (2, 3, "LOC", "Q13")],
            [(0, 1, "PERSON", "Q2"), (2, 3, "LOC", "NIL")],
        ),
        (
            "London is great.",
            [(0, 1, "LOC", "NIL")],
            [(0, 1, "LOC", "NIL")],
        ),
    ]

    train_examples = []
    for text, ref_ents, pred_ents in cases:
        ref = annotated(text, ref_ents)
        pred = annotated(text, pred_ents)
        train_examples.append(Example(pred, ref))

    scores = Scorer().score_links(train_examples, negative_labels=["NIL"])
    per_type = scores["nel_f_per_type"]
    assert per_type["PERSON"]["p"] == 1 / 2
    assert per_type["PERSON"]["r"] == 1 / 2
    assert per_type["LOC"]["p"] == 1 / 1
    assert per_type["LOC"]["r"] == 1 / 2

    assert scores["nel_micro_p"] == 2 / 3
    assert scores["nel_micro_r"] == 2 / 4

View File

@ -345,12 +345,13 @@ def test_language_factories_invalid():
[{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}], [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
{"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25}, {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
), ),
([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},), ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
], ],
) )
def test_language_factories_combine_score_weights(weights, expected): def test_language_factories_combine_score_weights(weights, expected):
result = combine_score_weights(weights) result = combine_score_weights(weights)
assert sum(result.values()) in (0.99, 1.0) assert sum(result.values()) in (0.99, 1.0, 0.0)
assert result == expected assert result == expected

View File

@ -244,3 +244,22 @@ def test_Example_from_dict_with_links_invalid(annots):
predicted = Doc(vocab, words=annots["words"]) predicted = Doc(vocab, words=annots["words"])
with pytest.raises(ValueError): with pytest.raises(ValueError):
Example.from_dict(predicted, annots) Example.from_dict(predicted, annots)
def test_Example_from_dict_sentences():
    """Check the reference doc's sentence count for given "sent_starts"."""
    vocab = Vocab()

    def count_reference_sents(words, sent_starts):
        # Build an Example from the words plus "sent_starts" annotations and
        # count the sentences of its reference doc.
        predicted = Doc(vocab, words=words)
        example = Example.from_dict(predicted, {"sent_starts": sent_starts})
        return len(list(example.reference.sents))

    words = ["One", "sentence", ".", "one", "more"]
    assert count_reference_sents(words, [1, 0, 0, 1, 0]) == 2

    # this currently throws an error - bug or feature?
    # words = ["One", "sentence", "not", "one", "more"]
    # assert count_reference_sents(words, [1, 0, 0, 0, 0]) == 1

    words = ["One", "sentence", "not", "one", "more"]
    assert count_reference_sents(words, [1, -1, 0, 0, 0]) == 1

View File

@ -1,4 +1,5 @@
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from libc.stdint cimport uint64_t
cdef class Example: cdef class Example:
@ -7,3 +8,5 @@ cdef class Example:
cdef readonly object _cached_alignment cdef readonly object _cached_alignment
cdef readonly object _cached_words_x cdef readonly object _cached_words_x
cdef readonly object _cached_words_y cdef readonly object _cached_words_y
cdef readonly uint64_t _x_sig
cdef readonly uint64_t _y_sig

View File

@ -1,6 +1,7 @@
from collections import Iterable as IterableInstance from collections import Iterable as IterableInstance
import warnings import warnings
import numpy import numpy
from murmurhash.mrmr cimport hash64
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..tokens.span cimport Span from ..tokens.span cimport Span
@ -97,15 +98,36 @@ cdef class Example:
@property @property
def alignment(self): def alignment(self):
words_x = [token.text for token in self.x] x_sig = hash64(self.x.c, sizeof(self.x.c[0]) * self.x.length, 0)
words_y = [token.text for token in self.y] y_sig = hash64(self.y.c, sizeof(self.y.c[0]) * self.y.length, 0)
if self._cached_alignment is None or \ if self._cached_alignment is None:
words_x != self._cached_words_x or \ words_x = [token.text for token in self.x]
words_y != self._cached_words_y: words_y = [token.text for token in self.y]
self._cached_alignment = Alignment.from_strings(words_x, words_y) self._x_sig = x_sig
self._y_sig = y_sig
self._cached_words_x = words_x self._cached_words_x = words_x
self._cached_words_y = words_y self._cached_words_y = words_y
return self._cached_alignment self._cached_alignment = Alignment.from_strings(words_x, words_y)
return self._cached_alignment
elif self._x_sig == x_sig and self._y_sig == y_sig:
# If we have a cached alignment, check whether the cache is invalid
# due to retokenization. To make this check fast in loops, we first
# check a hash of the TokenC arrays.
return self._cached_alignment
else:
words_x = [token.text for token in self.x]
words_y = [token.text for token in self.y]
if words_x == self._cached_words_x and words_y == self._cached_words_y:
self._x_sig = x_sig
self._y_sig = y_sig
return self._cached_alignment
else:
self._cached_alignment = Alignment.from_strings(words_x, words_y)
self._cached_words_x = words_x
self._cached_words_y = words_y
self._x_sig = x_sig
self._y_sig = y_sig
return self._cached_alignment
def get_aligned(self, field, as_string=False): def get_aligned(self, field, as_string=False):
"""Return an aligned array for a token attribute.""" """Return an aligned array for a token attribute."""
@ -288,7 +310,6 @@ def _annot2array(vocab, tok_annot, doc_annot):
def _add_entities_to_doc(doc, ner_data): def _add_entities_to_doc(doc, ner_data):
print(ner_data)
if ner_data is None: if ner_data is None:
return return
elif ner_data == []: elif ner_data == []:

View File

@ -1233,8 +1233,13 @@ def combine_score_weights(
# components. # components.
total = sum(w_dict.values()) total = sum(w_dict.values())
for key, value in w_dict.items(): for key, value in w_dict.items():
weight = round(value / total / len(all_weights), 2) if total == 0:
result[key] = result.get(key, 0.0) + weight weight = 0.0
else:
weight = round(value / total / len(all_weights), 2)
prev_weight = result.get(key, 0.0)
prev_weight = 0.0 if prev_weight is None else prev_weight
result[key] = prev_weight + weight
return result return result

View File

@ -225,6 +225,21 @@ pipe's entity linking model and context encoder. Delegates to
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## EntityLinker.score {#score tag="method" new="3"}
Score a batch of examples.
> #### Example
>
> ```python
> scores = entity_linker.score(examples)
> ```
| Name | Description |
| ----------- | ---------------------------------------------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links). ~~Dict[str, float]~~ |
## EntityLinker.create_optimizer {#create_optimizer tag="method"} ## EntityLinker.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component. Create an optimizer for the pipeline component.

View File

@ -242,10 +242,10 @@ Score a batch of examples.
> scores = ner.score(examples) > scores = ner.score(examples)
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | ---------------------------------------------------------------------------------------------------------------------- | | ----------- | --------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ | | `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ | | **RETURNS** | The scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## EntityRecognizer.create_optimizer {#create_optimizer tag="method"} ## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}

View File

@ -30,20 +30,20 @@ pattern keys correspond to a number of
[`Token` attributes](/api/token#attributes). The supported attributes for [`Token` attributes](/api/token#attributes). The supported attributes for
rule-based matching are: rule-based matching are:
| Attribute |  Description | | Attribute |  Description |
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- | | ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
| `ORTH` | The exact verbatim text of a token. ~~str~~ | | `ORTH` | The exact verbatim text of a token. ~~str~~ |
| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ | | `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
| `LOWER` | The lowercase form of the token text. ~~str~~ | | `LOWER` | The lowercase form of the token text. ~~str~~ |
|  `LENGTH` | The length of the token text. ~~int~~ | |  `LENGTH` | The length of the token text. ~~int~~ |
|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ | |  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ | |  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ | |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ | |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ | |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ | | `ENT_TYPE` | The token's entity label. ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | | `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ | | `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ |
Operators and quantifiers define **how often** a token pattern should be Operators and quantifiers define **how often** a token pattern should be
matched: matched:
@ -79,6 +79,8 @@ it compares to another value.
| -------------------------- | ------------------------------------------------------------------------------------------------------- | | -------------------------- | ------------------------------------------------------------------------------------------------------- |
| `IN` | Attribute value is member of a list. ~~Any~~ | | `IN` | Attribute value is member of a list. ~~Any~~ |
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | | `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
| `IS_SUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
| `IS_SUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
## Matcher.\_\_init\_\_ {#init tag="method"} ## Matcher.\_\_init\_\_ {#init tag="method"}

View File

@ -206,3 +206,26 @@ depends on the scorer settings:
| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ | | `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ |
| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ | | `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ |
| **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ | | **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ |
## Scorer.score_links {#score_links tag="staticmethod" new="3"}
Returns PRF for predicted links on the entity level. To disentangle the
performance of the NEL from the NER, this method only evaluates NEL links for
entities that overlap between the gold reference and the predictions.
> #### Example
>
> ```python
> scores = Scorer.score_links(
> examples,
> negative_labels=["NIL", ""]
> )
> print(scores["nel_micro_f"])
> ```
| Name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~ |
| **RETURNS** | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~ |

View File

@ -158,20 +158,20 @@ The available token pattern keys correspond to a number of
[`Token` attributes](/api/token#attributes). The supported attributes for [`Token` attributes](/api/token#attributes). The supported attributes for
rule-based matching are: rule-based matching are:
| Attribute |  Description | | Attribute |  Description |
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- | | ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
| `ORTH` | The exact verbatim text of a token. ~~str~~ | | `ORTH` | The exact verbatim text of a token. ~~str~~ |
| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ | | `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
| `LOWER` | The lowercase form of the token text. ~~str~~ | | `LOWER` | The lowercase form of the token text. ~~str~~ |
|  `LENGTH` | The length of the token text. ~~int~~ | |  `LENGTH` | The length of the token text. ~~int~~ |
|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ | |  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ | |  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ | |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ | |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ | |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ | | `ENT_TYPE` | The token's entity label. ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | | `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ | | `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ |
<Accordion title="Does it matter if the attribute names are uppercase or lowercase?"> <Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
@ -236,6 +236,8 @@ following rich comparison attributes are available:
| -------------------------- | ------------------------------------------------------------------------------------------------------- | | -------------------------- | ------------------------------------------------------------------------------------------------------- |
| `IN` | Attribute value is member of a list. ~~Any~~ | | `IN` | Attribute value is member of a list. ~~Any~~ |
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | | `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
| `IS_SUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
| `IS_SUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
#### Regular expressions {#regex new="2.1"} #### Regular expressions {#regex new="2.1"}

View File

@ -11,12 +11,24 @@ import { Table, Tr, Td, Th } from '../components/table'
import Tag from '../components/tag' import Tag from '../components/tag'
import { H2, Label } from '../components/typography' import { H2, Label } from '../components/typography'
import Icon from '../components/icon' import Icon from '../components/icon'
import Link from '../components/link' import Link, { OptionalLink } from '../components/link'
import Infobox from '../components/infobox' import Infobox from '../components/infobox'
import Accordion from '../components/accordion' import Accordion from '../components/accordion'
import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util' import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
import { isString, isEmptyObj } from '../components/util' import { isString, isEmptyObj } from '../components/util'
const COMPONENT_LINKS = {
tok2vec: '/api/tok2vec',
transformer: '/api/transformer',
tagger: '/api/tagger',
parser: '/api/dependencyparser',
ner: '/api/entityrecognizer',
lemmatizer: '/api/lemmatizer',
attribute_ruler: '/api/attributeruler',
senter: '/api/sentencerecognizer',
morphologizer: '/api/morphologizer',
}
const MODEL_META = { const MODEL_META = {
core: 'Vocabulary, syntax, entities, vectors', core: 'Vocabulary, syntax, entities, vectors',
core_sm: 'Vocabulary, syntax, entities', core_sm: 'Vocabulary, syntax, entities',
@ -78,10 +90,15 @@ function isStableVersion(v) {
return !v.includes('a') && !v.includes('b') && !v.includes('dev') && !v.includes('rc') return !v.includes('a') && !v.includes('b') && !v.includes('dev') && !v.includes('rc')
} }
function getLatestVersion(modelId, compatibility) { function getLatestVersion(modelId, compatibility, prereleases) {
for (let [version, models] of Object.entries(compatibility)) { for (let [version, models] of Object.entries(compatibility)) {
if (isStableVersion(version) && models[modelId]) { if (isStableVersion(version) && models[modelId]) {
return models[modelId][0] const modelVersions = models[modelId]
for (let modelVersion of modelVersions) {
if (isStableVersion(modelVersion) || prereleases) {
return modelVersion
}
}
} }
} }
} }
@ -141,18 +158,44 @@ function formatSources(data = []) {
)) ))
} }
function linkComponents(components = []) {
return join(
components.map(c => (
<Fragment key={c}>
<OptionalLink to={COMPONENT_LINKS[c]} hideIcon>
<InlineCode>{c}</InlineCode>
</OptionalLink>
</Fragment>
))
)
}
const Help = ({ children }) => ( const Help = ({ children }) => (
<span data-tooltip={children}> <span data-tooltip={children}>
<Icon name="help2" width={16} variant="subtle" inline /> <Icon name="help2" width={16} variant="subtle" inline />
</span> </span>
) )
const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExamples, licenses }) => { const Model = ({
name,
langId,
langName,
baseUrl,
repo,
compatibility,
hasExamples,
licenses,
prereleases,
}) => {
const [initialized, setInitialized] = useState(false) const [initialized, setInitialized] = useState(false)
const [isError, setIsError] = useState(true) const [isError, setIsError] = useState(true)
const [meta, setMeta] = useState({}) const [meta, setMeta] = useState({})
const { type, genre, size } = getModelComponents(name) const { type, genre, size } = getModelComponents(name)
const version = useMemo(() => getLatestVersion(name, compatibility), [name, compatibility]) const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [
name,
compatibility,
prereleases,
])
useEffect(() => { useEffect(() => {
window.dispatchEvent(new Event('resize')) // scroll position for progress window.dispatchEvent(new Event('resize')) // scroll position for progress
@ -173,10 +216,8 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const releaseTag = meta.fullName ? `/tag/${meta.fullName}` : '' const releaseTag = meta.fullName ? `/tag/${meta.fullName}` : ''
const releaseUrl = `https://github.com/${repo}/releases/${releaseTag}` const releaseUrl = `https://github.com/${repo}/releases/${releaseTag}`
const pipeline = const pipeline = linkComponents(meta.pipeline)
meta.pipeline && join(meta.pipeline.map(p => <InlineCode key={p}>{p}</InlineCode>)) const components = linkComponents(meta.components)
const components =
meta.components && join(meta.components.map(p => <InlineCode key={p}>{p}</InlineCode>))
const sources = formatSources(meta.sources) const sources = formatSources(meta.sources)
const author = !meta.url ? meta.author : <Link to={meta.url}>{meta.author}</Link> const author = !meta.url ? meta.author : <Link to={meta.url}>{meta.author}</Link>
const licenseUrl = licenses[meta.license] ? licenses[meta.license].url : null const licenseUrl = licenses[meta.license] ? licenses[meta.license].url : null
@ -332,7 +373,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const Models = ({ pageContext, repo, children }) => { const Models = ({ pageContext, repo, children }) => {
const [initialized, setInitialized] = useState(false) const [initialized, setInitialized] = useState(false)
const [compatibility, setCompatibility] = useState({}) const [compatibility, setCompatibility] = useState({})
const { id, title, meta, hasExamples } = pageContext const { id, title, meta } = pageContext
const { models, isStarters } = meta const { models, isStarters } = meta
const baseUrl = `https://raw.githubusercontent.com/${repo}/master` const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
@ -381,6 +422,7 @@ const Models = ({ pageContext, repo, children }) => {
repo={repo} repo={repo}
licenses={arrayToObj(site.siteMetadata.licenses, 'id')} licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
hasExamples={meta.hasExamples} hasExamples={meta.hasExamples}
prereleases={site.siteMetadata.nightly}
/> />
)) ))
} }
@ -397,6 +439,7 @@ const query = graphql`
query ModelsQuery { query ModelsQuery {
site { site {
siteMetadata { siteMetadata {
nightly
licenses { licenses {
id id
url url