Commit d8fcebf386: Merge branch 'master' into spacy.io
Repository: https://github.com/explosion/spaCy.git
@@ -4,13 +4,13 @@
 # fmt: off

 __title__ = "spacy"
-__version__ = "2.1.7.dev0"
+__version__ = "2.1.7"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
 __email__ = "contact@explosion.ai"
 __license__ = "MIT"
-__release__ = False
+__release__ = True

 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
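This bumps the package from the 2.1.7.dev0 pre-release to the 2.1.7 release and flips the release flag to match. For reference, the installed values can be checked at runtime (a trivial sketch, not specific to this commit):

    from spacy import about

    print(about.__title__, about.__version__)  # e.g. "spacy 2.1.7"
    print(about.__release__)                   # True for a tagged release build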
@@ -24,6 +24,7 @@ except ImportError:
     ftfy = None


+DEFAULT_OOV_PROB = -20
 msg = Printer()

@@ -108,14 +109,21 @@ def open_file(loc):


 def read_attrs_from_deprecated(freqs_loc, clusters_loc):
+    if freqs_loc is not None:
         with msg.loading("Counting frequencies..."):
-            probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
+            probs, _ = read_freqs(freqs_loc)
         msg.good("Counted frequencies")
+    else:
+        probs, _ = ({}, DEFAULT_OOV_PROB)
+    if clusters_loc:
         with msg.loading("Reading clusters..."):
-            clusters = read_clusters(clusters_loc) if clusters_loc else {}
+            clusters = read_clusters(clusters_loc)
         msg.good("Read clusters")
+    else:
+        clusters = {}
     lex_attrs = []
     sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
+    if len(sorted_probs):
         for i, (word, prob) in tqdm(enumerate(sorted_probs)):
             attrs = {"orth": word, "id": i, "prob": prob}
             # Decode as a little-endian string, so that we can do & 15 to get
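The rewritten helper treats both the frequencies and the clusters file as optional: missing frequencies fall back to an empty table with DEFAULT_OOV_PROB, and missing clusters fall back to an empty dict. A standalone sketch of that guard-and-default pattern (plain Python, not spaCy's actual helper):

    DEFAULT_OOV_PROB = -20

    def read_attrs(freqs=None, clusters=None):
        # Each input is optional; fall back to empty containers when it is absent
        probs = dict(freqs) if freqs is not None else {}
        clusters = dict(clusters) if clusters is not None else {}
        return probs, clusters

    print(read_attrs())                        # ({}, {})
    print(read_attrs(freqs=[("the", -3.0)]))   # ({'the': -3.0}, {})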
@@ -142,8 +150,11 @@ def create_model(lang, lex_attrs):
         lexeme.is_oov = False
         lex_added += 1
         lex_added += 1
-    oov_prob = min(lex.prob for lex in nlp.vocab)
-    nlp.vocab.cfg.update({"oov_prob": oov_prob - 1})
+    if len(nlp.vocab):
+        oov_prob = min(lex.prob for lex in nlp.vocab) - 1
+    else:
+        oov_prob = DEFAULT_OOV_PROB
+    nlp.vocab.cfg.update({"oov_prob": oov_prob})
     return nlp

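The new branch matters because min() over an empty vocabulary raises ValueError; with no lexemes, the model now falls back to DEFAULT_OOV_PROB instead. The pattern in isolation:

    DEFAULT_OOV_PROB = -20

    def pick_oov_prob(probs):
        # probs stands in for [lex.prob for lex in nlp.vocab]
        return min(probs) - 1 if probs else DEFAULT_OOV_PROB

    print(pick_oov_prob([]))            # -20
    print(pick_oov_prob([-3.5, -7.2]))  # -8.2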
@@ -415,6 +415,8 @@ class Errors(object):
             "is assigned to a KB identifier.")
     E149 = ("Error deserializing model. Check that the config used to create the "
             "component matches the model being loaded.")
+    E150 = ("The language of the `nlp` object and the `vocab` should be the same, "
+            "but found '{nlp}' and '{vocab}' respectively.")


 @add_codes
 class TempErrors(object):
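The new E150 template is formatted with the two language codes at the point where the mismatch is detected (see the Language.__init__ hunk below). How the rendered message looks, using the string exactly as added above:

    template = ("The language of the `nlp` object and the `vocab` should be the same, "
                "but found '{nlp}' and '{vocab}' respectively.")
    print(template.format(nlp="en", vocab="de"))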
@@ -14,7 +14,8 @@ import srsly
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
-from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer, EntityLinker
+from .pipeline import DependencyParser, Tagger
+from .pipeline import Tensorizer, EntityRecognizer, EntityLinker
 from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
 from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
 from .pipeline import EntityRuler
@@ -158,6 +159,9 @@ class Language(object):
             vocab = factory(self, **meta.get("vocab", {}))
             if vocab.vectors.name is None:
                 vocab.vectors.name = meta.get("vectors", {}).get("name")
+        else:
+            if (self.lang and vocab.lang) and (self.lang != vocab.lang):
+                raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
         self.vocab = vocab
         if make_doc is True:
             factory = self.Defaults.create_tokenizer
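In practice the new guard means a pipeline can no longer be constructed around a vocab created for a different language. A minimal sketch of the failure mode, assuming spaCy 2.1.7 with this change applied:

    from spacy.lang.de import German
    from spacy.lang.en import English

    de_vocab = German().vocab          # vocab.lang == "de"
    try:
        nlp = English(vocab=de_vocab)  # nlp.lang == "en" -> mismatch
    except ValueError as err:
        print(err)                     # the E150 message shown above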
@@ -173,7 +177,10 @@ class Language(object):

     @property
     def meta(self):
-        self._meta.setdefault("lang", self.vocab.lang)
+        if self.vocab.lang:
+            self._meta.setdefault("lang", self.vocab.lang)
+        else:
+            self._meta.setdefault("lang", self.lang)
         self._meta.setdefault("name", "model")
         self._meta.setdefault("version", "0.0.0")
         self._meta.setdefault("spacy_version", ">={}".format(about.__version__))
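The effect is that meta["lang"] now follows the vocab's language when the vocab has one, and only falls back to the Language subclass's lang otherwise. A trivial check (sketch):

    from spacy.lang.en import English

    nlp = English()
    assert nlp.vocab.lang == "en"
    assert nlp.meta["lang"] == "en"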
@@ -618,7 +625,9 @@ class Language(object):
         if component_cfg is None:
             component_cfg = {}
         docs, golds = zip(*docs_golds)
-        docs = [self.make_doc(doc) if isinstance(doc, basestring_) else doc for doc in docs]
+        docs = [
+            self.make_doc(doc) if isinstance(doc, basestring_) else doc for doc in docs
+        ]
         golds = list(golds)
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
@@ -769,8 +778,12 @@ class Language(object):
             exclude = disable
         path = util.ensure_path(path)
         serializers = OrderedDict()
-        serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(p, exclude=["vocab"])
-        serializers["meta.json"] = lambda p: p.open("w").write(srsly.json_dumps(self.meta))
+        serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(
+            p, exclude=["vocab"]
+        )
+        serializers["meta.json"] = lambda p: p.open("w").write(
+            srsly.json_dumps(self.meta)
+        )
         for name, proc in self.pipeline:
             if not hasattr(proc, "name"):
                 continue
@@ -799,14 +812,20 @@ class Language(object):
         path = util.ensure_path(path)
         deserializers = OrderedDict()
         deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
-        deserializers["vocab"] = lambda p: self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)
-        deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(p, exclude=["vocab"])
+        deserializers["vocab"] = lambda p: self.vocab.from_disk(
+            p
+        ) and _fix_pretrained_vectors_name(self)
+        deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
+            p, exclude=["vocab"]
+        )
         for name, proc in self.pipeline:
             if name in exclude:
                 continue
             if not hasattr(proc, "from_disk"):
                 continue
-            deserializers[name] = lambda p, proc=proc: proc.from_disk(p, exclude=["vocab"])
+            deserializers[name] = lambda p, proc=proc: proc.from_disk(
+                p, exclude=["vocab"]
+            )
         if not (path / "vocab").exists() and "vocab" not in exclude:
             # Convert to list here in case exclude is (default) tuple
             exclude = list(exclude) + ["vocab"]
@@ -852,14 +871,20 @@ class Language(object):
             exclude = disable
         deserializers = OrderedDict()
         deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
-        deserializers["vocab"] = lambda b: self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)
-        deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(b, exclude=["vocab"])
+        deserializers["vocab"] = lambda b: self.vocab.from_bytes(
+            b
+        ) and _fix_pretrained_vectors_name(self)
+        deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
+            b, exclude=["vocab"]
+        )
         for name, proc in self.pipeline:
             if name in exclude:
                 continue
             if not hasattr(proc, "from_bytes"):
                 continue
-            deserializers[name] = lambda b, proc=proc: proc.from_bytes(b, exclude=["vocab"])
+            deserializers[name] = lambda b, proc=proc: proc.from_bytes(
+                b, exclude=["vocab"]
+            )
         exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
         util.from_bytes(bytes_data, deserializers, exclude)
         return self
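The to_disk/from_disk/from_bytes changes above only re-wrap the lambdas for line length; the exclude behaviour is unchanged. A round-trip sketch using that mechanism (spaCy 2.1.x API assumed):

    from spacy.lang.en import English

    nlp = English()
    data = nlp.to_bytes(exclude=["tokenizer"])                # skip the tokenizer on write
    nlp2 = English().from_bytes(data, exclude=["tokenizer"])  # and again on read
    assert nlp2.lang == "en"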
@@ -66,8 +66,12 @@ class Pipe(object):
         and `set_annotations()` methods.
         """
         self.require_model()
-        scores, tensors = self.predict([doc])
-        self.set_annotations([doc], scores, tensors=tensors)
+        predictions = self.predict([doc])
+        if isinstance(predictions, tuple) and len(predictions) == 2:
+            scores, tensors = predictions
+            self.set_annotations([doc], scores, tensors=tensors)
+        else:
+            self.set_annotations([doc], predictions)
         return doc

     def require_model(self):
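The branch exists because predict() returns just scores for some components and a (scores, tensors) pair for others; __call__ now handles both shapes. The dispatch logic in isolation (illustrative names, not spaCy's API):

    def dispatch(predictions):
        if isinstance(predictions, tuple) and len(predictions) == 2:
            scores, tensors = predictions
            return "scores + tensors", scores, tensors
        return "scores only", predictions

    print(dispatch([0.1, 0.9]))
    print(dispatch(([0.1, 0.9], [[1.0, 2.0]])))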
@@ -159,12 +159,19 @@ class Scorer(object):
                 else:
                     cand_deps.add((gold_i, gold_head, token.dep_.lower()))
         if "-" not in [token[-1] for token in gold.orig_annot]:
+            # Find all NER labels in gold and doc
+            ent_labels = set([x[0] for x in gold_ents]
+                             + [k.label_ for k in doc.ents])
+            # Set up all labels for per type scoring and prepare gold per type
+            gold_per_ents = {ent_label: set() for ent_label in ent_labels}
+            for ent_label in ent_labels:
+                if ent_label not in self.ner_per_ents:
+                    self.ner_per_ents[ent_label] = PRFScore()
+                gold_per_ents[ent_label].update([x for x in gold_ents if x[0] == ent_label])
+            # Find all candidate labels, for all and per type
             cand_ents = set()
-            current_ent = {k.label_: set() for k in doc.ents}
-            current_gold = {k.label_: set() for k in doc.ents}
+            cand_per_ents = {ent_label: set() for ent_label in ent_labels}
             for ent in doc.ents:
-                if ent.label_ not in self.ner_per_ents:
-                    self.ner_per_ents[ent.label_] = PRFScore()
                 first = gold.cand_to_gold[ent.start]
                 last = gold.cand_to_gold[ent.end - 1]
                 if first is None or last is None:
@@ -172,14 +179,11 @@ class Scorer(object):
                     self.ner_per_ents[ent.label_].fp += 1
                 else:
                     cand_ents.add((ent.label_, first, last))
-                    current_ent[ent.label_].update([x for x in cand_ents if x[0] == ent.label_])
-                    current_gold[ent.label_].update([x for x in gold_ents if x[0] == ent.label_])
+                    cand_per_ents[ent.label_].add((ent.label_, first, last))
             # Scores per ent
-            [
-                v.score_set(current_ent[k], current_gold[k])
-                for k, v in self.ner_per_ents.items()
-                if k in current_ent
-            ]
+            for k, v in self.ner_per_ents.items():
+                if k in cand_per_ents:
+                    v.score_set(cand_per_ents[k], gold_per_ents[k])
             # Score for all ents
             self.ner.score_set(cand_ents, gold_ents)
         self.tags.score_set(cand_tags, gold_tags)
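With the two scorer hunks above, scorer.scores gains a per-label breakdown alongside the overall NER numbers. A usage sketch that mirrors the new test file added further down (Span/GoldParse usage assumed from the spaCy 2.1 API):

    from spacy.gold import GoldParse
    from spacy.lang.en import English
    from spacy.scorer import Scorer
    from spacy.tokens import Span

    nlp = English()
    doc = nlp("100 - 200")
    doc.ents = [Span(doc, 0, 1, label="CARDINAL"), Span(doc, 2, 3, label="CARDINAL")]
    gold = GoldParse(doc, entities=[(0, 3, "CARDINAL"), (6, 9, "CARDINAL")])

    scorer = Scorer()
    scorer.score(doc, gold)
    print(scorer.scores["ents_f"])                     # 100.0
    print(scorer.scores["ents_per_type"]["CARDINAL"])  # per-label p/r/f, new in this commit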
@@ -1,34 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from spacy.gold import GoldParse
-from spacy.scorer import Scorer
-from ..util import get_doc
-
-test_samples = [
-    [
-        "100 - 200",
-        {
-            "entities": [
-                [0, 3, "CARDINAL"],
-                [6, 9, "CARDINAL"]
-            ]
-        }
-    ]
-]
-
-def test_issue3625(en_vocab):
-    scorer = Scorer()
-    for input_, annot in test_samples:
-        doc = get_doc(en_vocab, words = input_.split(' '), ents = [[0,1,'CARDINAL'], [2,3,'CARDINAL']]);
-        gold = GoldParse(doc, entities = annot['entities'])
-        scorer.score(doc, gold)
-    results = scorer.scores
-
-    # Expects total accuracy and accuracy for each each entity to be 100%
-    assert results['ents_p'] == 100
-    assert results['ents_f'] == 100
-    assert results['ents_r'] == 100
-    assert results['ents_per_type']['CARDINAL']['p'] == 100
-    assert results['ents_per_type']['CARDINAL']['f'] == 100
-    assert results['ents_per_type']['CARDINAL']['r'] == 100
spacy/tests/regression/test_issue4054.py (new file, 33 lines)

@@ -0,0 +1,33 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.vocab import Vocab
+
+import spacy
+from spacy.lang.en import English
+from spacy.tests.util import make_tempdir
+from spacy.util import ensure_path
+
+
+def test_issue4054(en_vocab):
+    """Test that a new blank model can be made with a vocab from file,
+    and that serialization does not drop the language at any point."""
+    nlp1 = English()
+    vocab1 = nlp1.vocab
+
+    with make_tempdir() as d:
+        vocab_dir = ensure_path(d / "vocab")
+        if not vocab_dir.exists():
+            vocab_dir.mkdir()
+        vocab1.to_disk(vocab_dir)
+
+        vocab2 = Vocab().from_disk(vocab_dir)
+        print("lang", vocab2.lang)
+        nlp2 = spacy.blank("en", vocab=vocab2)
+
+        nlp_dir = ensure_path(d / "nlp")
+        if not nlp_dir.exists():
+            nlp_dir.mkdir()
+        nlp2.to_disk(nlp_dir)
+        nlp3 = spacy.load(nlp_dir)
+        assert nlp3.lang == "en"
spacy/tests/test_scorer.py (new file, 73 lines)

@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from pytest import approx
+from spacy.gold import GoldParse
+from spacy.scorer import Scorer
+from .util import get_doc
+
+test_ner_cardinal = [
+    [
+        "100 - 200",
+        {
+            "entities": [
+                [0, 3, "CARDINAL"],
+                [6, 9, "CARDINAL"]
+            ]
+        }
+    ]
+]
+
+test_ner_apple = [
+    [
+        "Apple is looking at buying U.K. startup for $1 billion",
+        {
+            "entities": [
+                (0, 5, "ORG"),
+                (27, 31, "GPE"),
+                (44, 54, "MONEY"),
+            ]
+        }
+    ]
+]
+
+def test_ner_per_type(en_vocab):
+    # Gold and Doc are identical
+    scorer = Scorer()
+    for input_, annot in test_ner_cardinal:
+        doc = get_doc(en_vocab, words = input_.split(' '), ents = [[0, 1, 'CARDINAL'], [2, 3, 'CARDINAL']])
+        gold = GoldParse(doc, entities = annot['entities'])
+        scorer.score(doc, gold)
+    results = scorer.scores
+
+    assert results['ents_p'] == 100
+    assert results['ents_f'] == 100
+    assert results['ents_r'] == 100
+    assert results['ents_per_type']['CARDINAL']['p'] == 100
+    assert results['ents_per_type']['CARDINAL']['f'] == 100
+    assert results['ents_per_type']['CARDINAL']['r'] == 100
+
+    # Doc has one missing and one extra entity
+    # Entity type MONEY is not present in Doc
+    scorer = Scorer()
+    for input_, annot in test_ner_apple:
+        doc = get_doc(en_vocab, words = input_.split(' '), ents = [[0, 1, 'ORG'], [5, 6, 'GPE'], [6, 7, 'ORG']])
+        gold = GoldParse(doc, entities = annot['entities'])
+        scorer.score(doc, gold)
+    results = scorer.scores
+
+    assert results['ents_p'] == approx(66.66666)
+    assert results['ents_r'] == approx(66.66666)
+    assert results['ents_f'] == approx(66.66666)
+    assert 'GPE' in results['ents_per_type']
+    assert 'MONEY' in results['ents_per_type']
+    assert 'ORG' in results['ents_per_type']
+    assert results['ents_per_type']['GPE']['p'] == 100
+    assert results['ents_per_type']['GPE']['r'] == 100
+    assert results['ents_per_type']['GPE']['f'] == 100
+    assert results['ents_per_type']['MONEY']['p'] == 0
+    assert results['ents_per_type']['MONEY']['r'] == 0
+    assert results['ents_per_type']['MONEY']['f'] == 0
+    assert results['ents_per_type']['ORG']['p'] == 50
+    assert results['ents_per_type']['ORG']['r'] == 100
+    assert results['ents_per_type']['ORG']['f'] == approx(66.66666)
@@ -471,6 +471,17 @@ cdef class Span:
             self._vector_norm = xp.sqrt(total) if total != 0. else 0.
         return self._vector_norm

+    @property
+    def tensor(self):
+        """The span's slice of the doc's tensor.
+
+        RETURNS (ndarray[ndim=2, dtype='float32']): A 2D numpy or cupy array
+            representing the span's semantics.
+        """
+        if self.doc.tensor is None:
+            return None
+        return self.doc.tensor[self.start : self.end]
+
     @property
     def sentiment(self):
         """RETURNS (float): A scalar value indicating the positivity or
@@ -408,6 +408,12 @@ cdef class Token:
         total = (vector ** 2).sum()
         return xp.sqrt(total) if total != 0. else 0.

+    @property
+    def tensor(self):
+        if self.doc.tensor is None:
+            return None
+        return self.doc.tensor[self.i]
+
     @property
     def n_lefts(self):
         """The number of leftward immediate children of the word, in the
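Usage sketch for the two new properties; this assumes a loaded pipeline that actually fills doc.tensor (for example a pretrained package such as en_core_web_sm), otherwise both return None:

    import spacy

    nlp = spacy.load("en_core_web_sm")   # any pipeline that sets doc.tensor
    doc = nlp("spaCy exposes tensor views")
    print(doc[1:3].tensor.shape)         # (2, width): one row per token in the span
    print(doc[1].tensor.shape)           # (width,): that token's row of doc.tensor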
@@ -14,7 +14,7 @@ import Icon from '../components/icon'
 import Link from '../components/link'
 import Grid from '../components/grid'
 import Infobox from '../components/infobox'
-import { join, arrayToObj, abbrNum } from '../components/util'
+import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'

 const MODEL_META = {
     core: 'Vocabulary, syntax, entities, vectors',
@@ -43,6 +43,10 @@ const MODEL_META = {
     compat: 'Latest compatible model version for your spaCy installation',
 }

+const MARKDOWN_COMPONENTS = {
+    code: InlineCode,
+}
+
 function getModelComponents(name) {
     const [lang, type, genre, size] = name.split('_')
     return { lang, type, genre, size }
@@ -192,10 +196,8 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
                     python -m spacy download {name}
                 </CodeBlock>
             </Aside>
-            {meta.description && <p>{meta.description}</p>}
-
+            {meta.description && markdownToReact(meta.description, MARKDOWN_COMPONENTS)}
             {isError && error}
-
             <Table>
                 <tbody>
                     {rows.map(({ label, tag, help, content }, i) =>
@@ -243,7 +245,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
                 )
             )}
         </Grid>
-        {meta.notes && <p>{meta.notes}</p>}
+        {meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
         {hasInteractiveCode && (
             <CodeBlock title="Try out the model" lang="python" executable={true}>
                 {[