Merge branch 'master' into spacy.io

This commit is contained in:
Ines Montani 2019-08-01 18:33:23 +02:00
commit d8fcebf386
12 changed files with 219 additions and 82 deletions

View File

@ -4,13 +4,13 @@
# fmt: off
__title__ = "spacy"
__version__ = "2.1.7.dev0"
__version__ = "2.1.7"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io"
__author__ = "Explosion AI"
__email__ = "contact@explosion.ai"
__license__ = "MIT"
__release__ = False
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -24,6 +24,7 @@ except ImportError:
ftfy = None
DEFAULT_OOV_PROB = -20
msg = Printer()
@ -108,23 +109,30 @@ def open_file(loc):
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
with msg.loading("Counting frequencies..."):
probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
msg.good("Counted frequencies")
with msg.loading("Reading clusters..."):
clusters = read_clusters(clusters_loc) if clusters_loc else {}
msg.good("Read clusters")
if freqs_loc is not None:
with msg.loading("Counting frequencies..."):
probs, _ = read_freqs(freqs_loc)
msg.good("Counted frequencies")
else:
probs, _ = ({}, DEFAULT_OOV_PROB)
if clusters_loc:
with msg.loading("Reading clusters..."):
clusters = read_clusters(clusters_loc)
msg.good("Read clusters")
else:
clusters = {}
lex_attrs = []
sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
for i, (word, prob) in tqdm(enumerate(sorted_probs)):
attrs = {"orth": word, "id": i, "prob": prob}
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
attrs["cluster"] = int(clusters[word][::-1], 2)
else:
attrs["cluster"] = 0
lex_attrs.append(attrs)
if len(sorted_probs):
for i, (word, prob) in tqdm(enumerate(sorted_probs)):
attrs = {"orth": word, "id": i, "prob": prob}
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
attrs["cluster"] = int(clusters[word][::-1], 2)
else:
attrs["cluster"] = 0
lex_attrs.append(attrs)
return lex_attrs
@ -142,8 +150,11 @@ def create_model(lang, lex_attrs):
lexeme.is_oov = False
lex_added += 1
lex_added += 1
oov_prob = min(lex.prob for lex in nlp.vocab)
nlp.vocab.cfg.update({"oov_prob": oov_prob - 1})
if len(nlp.vocab):
oov_prob = min(lex.prob for lex in nlp.vocab) - 1
else:
oov_prob = DEFAULT_OOV_PROB
nlp.vocab.cfg.update({"oov_prob": oov_prob})
return nlp

View File

@ -415,6 +415,8 @@ class Errors(object):
"is assigned to a KB identifier.")
E149 = ("Error deserializing model. Check that the config used to create the "
"component matches the model being loaded.")
E150 = ("The language of the `nlp` object and the `vocab` should be the same, "
"but found '{nlp}' and '{vocab}' respectively.")
@add_codes
class TempErrors(object):

View File

@ -14,7 +14,8 @@ import srsly
from .tokenizer import Tokenizer
from .vocab import Vocab
from .lemmatizer import Lemmatizer
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer, EntityLinker
from .pipeline import DependencyParser, Tagger
from .pipeline import Tensorizer, EntityRecognizer, EntityLinker
from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
from .pipeline import EntityRuler
@ -158,6 +159,9 @@ class Language(object):
vocab = factory(self, **meta.get("vocab", {}))
if vocab.vectors.name is None:
vocab.vectors.name = meta.get("vectors", {}).get("name")
else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
self.vocab = vocab
if make_doc is True:
factory = self.Defaults.create_tokenizer
@ -173,7 +177,10 @@ class Language(object):
@property
def meta(self):
self._meta.setdefault("lang", self.vocab.lang)
if self.vocab.lang:
self._meta.setdefault("lang", self.vocab.lang)
else:
self._meta.setdefault("lang", self.lang)
self._meta.setdefault("name", "model")
self._meta.setdefault("version", "0.0.0")
self._meta.setdefault("spacy_version", ">={}".format(about.__version__))
@ -618,7 +625,9 @@ class Language(object):
if component_cfg is None:
component_cfg = {}
docs, golds = zip(*docs_golds)
docs = [self.make_doc(doc) if isinstance(doc, basestring_) else doc for doc in docs]
docs = [
self.make_doc(doc) if isinstance(doc, basestring_) else doc for doc in docs
]
golds = list(golds)
for name, pipe in self.pipeline:
kwargs = component_cfg.get(name, {})
@ -769,8 +778,12 @@ class Language(object):
exclude = disable
path = util.ensure_path(path)
serializers = OrderedDict()
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(p, exclude=["vocab"])
serializers["meta.json"] = lambda p: p.open("w").write(srsly.json_dumps(self.meta))
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(
p, exclude=["vocab"]
)
serializers["meta.json"] = lambda p: p.open("w").write(
srsly.json_dumps(self.meta)
)
for name, proc in self.pipeline:
if not hasattr(proc, "name"):
continue
@ -799,14 +812,20 @@ class Language(object):
path = util.ensure_path(path)
deserializers = OrderedDict()
deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
deserializers["vocab"] = lambda p: self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)
deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(p, exclude=["vocab"])
deserializers["vocab"] = lambda p: self.vocab.from_disk(
p
) and _fix_pretrained_vectors_name(self)
deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
p, exclude=["vocab"]
)
for name, proc in self.pipeline:
if name in exclude:
continue
if not hasattr(proc, "from_disk"):
continue
deserializers[name] = lambda p, proc=proc: proc.from_disk(p, exclude=["vocab"])
deserializers[name] = lambda p, proc=proc: proc.from_disk(
p, exclude=["vocab"]
)
if not (path / "vocab").exists() and "vocab" not in exclude:
# Convert to list here in case exclude is (default) tuple
exclude = list(exclude) + ["vocab"]
@ -852,14 +871,20 @@ class Language(object):
exclude = disable
deserializers = OrderedDict()
deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
deserializers["vocab"] = lambda b: self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)
deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(b, exclude=["vocab"])
deserializers["vocab"] = lambda b: self.vocab.from_bytes(
b
) and _fix_pretrained_vectors_name(self)
deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
b, exclude=["vocab"]
)
for name, proc in self.pipeline:
if name in exclude:
continue
if not hasattr(proc, "from_bytes"):
continue
deserializers[name] = lambda b, proc=proc: proc.from_bytes(b, exclude=["vocab"])
deserializers[name] = lambda b, proc=proc: proc.from_bytes(
b, exclude=["vocab"]
)
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
util.from_bytes(bytes_data, deserializers, exclude)
return self

View File

@ -66,8 +66,12 @@ class Pipe(object):
and `set_annotations()` methods.
"""
self.require_model()
scores, tensors = self.predict([doc])
self.set_annotations([doc], scores, tensors=tensors)
predictions = self.predict([doc])
# `predict` may hand back either bare scores or a `(scores, tensors)` pair.
# Measure the returned object itself: `len(tuple)` would call `len` on the
# builtin type and raise TypeError every time the isinstance check passes.
if isinstance(predictions, tuple) and len(predictions) == 2:
    scores, tensors = predictions
    # NOTE(review): keyword spelled `tensors=`, as in the call this branch
    # replaces; confirm against the `set_annotations` signature.
    self.set_annotations([doc], scores, tensors=tensors)
else:
    # Anything else is treated as the scores alone.
    self.set_annotations([doc], predictions)
return doc
def require_model(self):

View File

@ -159,12 +159,19 @@ class Scorer(object):
else:
cand_deps.add((gold_i, gold_head, token.dep_.lower()))
if "-" not in [token[-1] for token in gold.orig_annot]:
# Find all NER labels in gold and doc
ent_labels = set([x[0] for x in gold_ents]
+ [k.label_ for k in doc.ents])
# Set up all labels for per type scoring and prepare gold per type
gold_per_ents = {ent_label: set() for ent_label in ent_labels}
for ent_label in ent_labels:
if ent_label not in self.ner_per_ents:
self.ner_per_ents[ent_label] = PRFScore()
gold_per_ents[ent_label].update([x for x in gold_ents if x[0] == ent_label])
# Find all candidate labels, for all and per type
cand_ents = set()
current_ent = {k.label_: set() for k in doc.ents}
current_gold = {k.label_: set() for k in doc.ents}
cand_per_ents = {ent_label: set() for ent_label in ent_labels}
for ent in doc.ents:
if ent.label_ not in self.ner_per_ents:
self.ner_per_ents[ent.label_] = PRFScore()
first = gold.cand_to_gold[ent.start]
last = gold.cand_to_gold[ent.end - 1]
if first is None or last is None:
@ -172,14 +179,11 @@ class Scorer(object):
self.ner_per_ents[ent.label_].fp += 1
else:
cand_ents.add((ent.label_, first, last))
current_ent[ent.label_].update([x for x in cand_ents if x[0] == ent.label_])
current_gold[ent.label_].update([x for x in gold_ents if x[0] == ent.label_])
cand_per_ents[ent.label_].add((ent.label_, first, last))
# Scores per ent
[
v.score_set(current_ent[k], current_gold[k])
for k, v in self.ner_per_ents.items()
if k in current_ent
]
for k, v in self.ner_per_ents.items():
if k in cand_per_ents:
v.score_set(cand_per_ents[k], gold_per_ents[k])
# Score for all ents
self.ner.score_set(cand_ents, gold_ents)
self.tags.score_set(cand_tags, gold_tags)

View File

@ -1,34 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from ..util import get_doc
test_samples = [
[
"100 - 200",
{
"entities": [
[0, 3, "CARDINAL"],
[6, 9, "CARDINAL"]
]
}
]
]
def test_issue3625(en_vocab):
"""Score docs whose entities mirror the gold annotations and check that
the overall and the per-type (CARDINAL) NER scores are all 100."""
scorer = Scorer()
for input_, annot in test_samples:
# The doc's entities are given as token spans; the gold entities in
# `annot` are given as character offsets into the same text.
doc = get_doc(en_vocab, words = input_.split(' '), ents = [[0,1,'CARDINAL'], [2,3,'CARDINAL']]);
gold = GoldParse(doc, entities = annot['entities'])
scorer.score(doc, gold)
results = scorer.scores
# Expects the overall scores and the scores for each entity type to be 100%
assert results['ents_p'] == 100
assert results['ents_f'] == 100
assert results['ents_r'] == 100
assert results['ents_per_type']['CARDINAL']['p'] == 100
assert results['ents_per_type']['CARDINAL']['f'] == 100
assert results['ents_per_type']['CARDINAL']['r'] == 100

View File

@ -0,0 +1,33 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.vocab import Vocab
import spacy
from spacy.lang.en import English
from spacy.tests.util import make_tempdir
from spacy.util import ensure_path
def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    # NOTE: `en_vocab` is not used by the body; the parameter is kept so the
    # test's signature stays unchanged.
    nlp1 = English()
    vocab1 = nlp1.vocab
    with make_tempdir() as d:
        # Step 1: round-trip the vocab on its own through disk.
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)
        vocab2 = Vocab().from_disk(vocab_dir)
        # Step 2: build a blank "en" pipeline around the reloaded vocab and
        # round-trip the whole pipeline through disk as well.
        nlp2 = spacy.blank("en", vocab=vocab2)
        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = spacy.load(nlp_dir)
        # The language must still be set after both serialization steps.
        assert nlp3.lang == "en"

View File

@ -0,0 +1,73 @@
# coding: utf-8
from __future__ import unicode_literals
from pytest import approx
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from .util import get_doc
test_ner_cardinal = [
[
"100 - 200",
{
"entities": [
[0, 3, "CARDINAL"],
[6, 9, "CARDINAL"]
]
}
]
]
test_ner_apple = [
[
"Apple is looking at buying U.K. startup for $1 billion",
{
"entities": [
(0, 5, "ORG"),
(27, 31, "GPE"),
(44, 54, "MONEY"),
]
}
]
]
def test_ner_per_type(en_vocab):
"""Check the overall and per-entity-type NER scores reported by Scorer.

Two scenarios are scored with a fresh Scorer each: one where the doc's
entities equal the gold entities, and one where the doc misses a gold
entity and predicts an extra one.
"""
# Gold and Doc are identical
scorer = Scorer()
for input_, annot in test_ner_cardinal:
doc = get_doc(en_vocab, words = input_.split(' '), ents = [[0, 1, 'CARDINAL'], [2, 3, 'CARDINAL']])
gold = GoldParse(doc, entities = annot['entities'])
scorer.score(doc, gold)
results = scorer.scores
# Everything matches, so every overall and per-type score is 100.
assert results['ents_p'] == 100
assert results['ents_f'] == 100
assert results['ents_r'] == 100
assert results['ents_per_type']['CARDINAL']['p'] == 100
assert results['ents_per_type']['CARDINAL']['f'] == 100
assert results['ents_per_type']['CARDINAL']['r'] == 100
# Doc has one missing and one extra entity
# Entity type MONEY is not present in Doc
scorer = Scorer()
for input_, annot in test_ner_apple:
# Predicted: ORG, GPE and a second ORG; gold: ORG, GPE and MONEY.
doc = get_doc(en_vocab, words = input_.split(' '), ents = [[0, 1, 'ORG'], [5, 6, 'GPE'], [6, 7, 'ORG']])
gold = GoldParse(doc, entities = annot['entities'])
scorer.score(doc, gold)
results = scorer.scores
# Two of three predictions are correct and two of three gold entities found.
assert results['ents_p'] == approx(66.66666)
assert results['ents_r'] == approx(66.66666)
assert results['ents_f'] == approx(66.66666)
# Every label seen in either gold or doc must get a per-type entry,
# including MONEY, which only occurs in gold.
assert 'GPE' in results['ents_per_type']
assert 'MONEY' in results['ents_per_type']
assert 'ORG' in results['ents_per_type']
assert results['ents_per_type']['GPE']['p'] == 100
assert results['ents_per_type']['GPE']['r'] == 100
assert results['ents_per_type']['GPE']['f'] == 100
# MONEY was never predicted, so all of its scores are 0.
assert results['ents_per_type']['MONEY']['p'] == 0
assert results['ents_per_type']['MONEY']['r'] == 0
assert results['ents_per_type']['MONEY']['f'] == 0
# ORG: two predicted, one gold, one match -> precision 50, recall 100.
assert results['ents_per_type']['ORG']['p'] == 50
assert results['ents_per_type']['ORG']['r'] == 100
assert results['ents_per_type']['ORG']['f'] == approx(66.66666)

View File

@ -471,6 +471,17 @@ cdef class Span:
self._vector_norm = xp.sqrt(total) if total != 0. else 0.
return self._vector_norm
@property
def tensor(self):
"""The span's slice of the doc's tensor.

RETURNS (ndarray[ndim=2, dtype='float32']): A 2D numpy or cupy array
representing the span's semantics, or None if the doc's tensor is None.
"""
# Nothing to slice when the parent doc carries no tensor.
if self.doc.tensor is None:
return None
# Slice the doc-level tensor from the span's `start` up to (not including) `end`.
return self.doc.tensor[self.start : self.end]
@property
def sentiment(self):
"""RETURNS (float): A scalar value indicating the positivity or

View File

@ -408,6 +408,12 @@ cdef class Token:
total = (vector ** 2).sum()
return xp.sqrt(total) if total != 0. else 0.
@property
def tensor(self):
"""The token's entry in the doc's tensor.

RETURNS: `self.doc.tensor[self.i]`, or None if the doc's tensor is None.
"""
# Nothing to index when the parent doc carries no tensor.
if self.doc.tensor is None:
return None
# Index the doc-level tensor with this token's `i` attribute.
return self.doc.tensor[self.i]
@property
def n_lefts(self):
"""The number of leftward immediate children of the word, in the

View File

@ -14,7 +14,7 @@ import Icon from '../components/icon'
import Link from '../components/link'
import Grid from '../components/grid'
import Infobox from '../components/infobox'
import { join, arrayToObj, abbrNum } from '../components/util'
import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
const MODEL_META = {
core: 'Vocabulary, syntax, entities, vectors',
@ -43,6 +43,10 @@ const MODEL_META = {
compat: 'Latest compatible model version for your spaCy installation',
}
const MARKDOWN_COMPONENTS = {
code: InlineCode,
}
function getModelComponents(name) {
const [lang, type, genre, size] = name.split('_')
return { lang, type, genre, size }
@ -192,10 +196,8 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
python -m spacy download {name}
</CodeBlock>
</Aside>
{meta.description && <p>{meta.description}</p>}
{meta.description && markdownToReact(meta.description, MARKDOWN_COMPONENTS)}
{isError && error}
<Table>
<tbody>
{rows.map(({ label, tag, help, content }, i) =>
@ -243,7 +245,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
)
)}
</Grid>
{meta.notes && <p>{meta.notes}</p>}
{meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
{hasInteractiveCode && (
<CodeBlock title="Try out the model" lang="python" executable={true}>
{[