Merge branch 'master' into spacy.io

Ines Montani 2019-08-01 18:33:23 +02:00
commit d8fcebf386
12 changed files with 219 additions and 82 deletions

View File

@@ -4,13 +4,13 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.1.7.dev0"
+__version__ = "2.1.7"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
 __email__ = "contact@explosion.ai"
 __license__ = "MIT"
-__release__ = False
+__release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@@ -24,6 +24,7 @@ except ImportError:
     ftfy = None
 
+DEFAULT_OOV_PROB = -20
 
 msg = Printer()
@@ -108,23 +109,30 @@ def open_file(loc):
 def read_attrs_from_deprecated(freqs_loc, clusters_loc):
-    with msg.loading("Counting frequencies..."):
-        probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
-    msg.good("Counted frequencies")
-    with msg.loading("Reading clusters..."):
-        clusters = read_clusters(clusters_loc) if clusters_loc else {}
-    msg.good("Read clusters")
+    if freqs_loc is not None:
+        with msg.loading("Counting frequencies..."):
+            probs, _ = read_freqs(freqs_loc)
+        msg.good("Counted frequencies")
+    else:
+        probs, _ = ({}, DEFAULT_OOV_PROB)
+    if clusters_loc:
+        with msg.loading("Reading clusters..."):
+            clusters = read_clusters(clusters_loc)
+        msg.good("Read clusters")
+    else:
+        clusters = {}
     lex_attrs = []
     sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
-    for i, (word, prob) in tqdm(enumerate(sorted_probs)):
-        attrs = {"orth": word, "id": i, "prob": prob}
-        # Decode as a little-endian string, so that we can do & 15 to get
-        # the first 4 bits. See _parse_features.pyx
-        if word in clusters:
-            attrs["cluster"] = int(clusters[word][::-1], 2)
-        else:
-            attrs["cluster"] = 0
-        lex_attrs.append(attrs)
+    if len(sorted_probs):
+        for i, (word, prob) in tqdm(enumerate(sorted_probs)):
+            attrs = {"orth": word, "id": i, "prob": prob}
+            # Decode as a little-endian string, so that we can do & 15 to get
+            # the first 4 bits. See _parse_features.pyx
+            if word in clusters:
+                attrs["cluster"] = int(clusters[word][::-1], 2)
+            else:
+                attrs["cluster"] = 0
+            lex_attrs.append(attrs)
     return lex_attrs
@@ -142,8 +150,11 @@ def create_model(lang, lex_attrs):
         lexeme.is_oov = False
         lex_added += 1
         lex_added += 1
-    oov_prob = min(lex.prob for lex in nlp.vocab)
-    nlp.vocab.cfg.update({"oov_prob": oov_prob - 1})
+    if len(nlp.vocab):
+        oov_prob = min(lex.prob for lex in nlp.vocab) - 1
+    else:
+        oov_prob = DEFAULT_OOV_PROB
+    nlp.vocab.cfg.update({"oov_prob": oov_prob})
     return nlp
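
Note on the cluster decoding kept in the hunk above: the Brown cluster path is reversed before being parsed as base-2, so the low bits of the resulting integer correspond to the start of the path and `& 15` picks out the first four. A tiny worked sketch; the cluster string "1011" is invented for illustration, not taken from a real clusters file:

# Rough sketch of the little-endian decoding used in read_attrs_from_deprecated().
# The cluster value "1011" is made up for illustration.
cluster = "1011"
value = int(cluster[::-1], 2)    # reverse "1011" -> "1101", parse as binary -> 13
first_four_bits = value & 15     # low 4 bits now hold the first part of the path
print(value, first_four_bits)    # 13 13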

View File

@@ -415,6 +415,8 @@ class Errors(object):
              "is assigned to a KB identifier.")
     E149 = ("Error deserializing model. Check that the config used to create the "
             "component matches the model being loaded.")
+    E150 = ("The language of the `nlp` object and the `vocab` should be the same, "
+            "but found '{nlp}' and '{vocab}' respectively.")
 
 
 @add_codes
 class TempErrors(object):

View File

@@ -14,7 +14,8 @@ import srsly
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
-from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer, EntityLinker
+from .pipeline import DependencyParser, Tagger
+from .pipeline import Tensorizer, EntityRecognizer, EntityLinker
 from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
 from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
 from .pipeline import EntityRuler
@@ -158,6 +159,9 @@ class Language(object):
             vocab = factory(self, **meta.get("vocab", {}))
             if vocab.vectors.name is None:
                 vocab.vectors.name = meta.get("vectors", {}).get("name")
+        else:
+            if (self.lang and vocab.lang) and (self.lang != vocab.lang):
+                raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
         self.vocab = vocab
         if make_doc is True:
             factory = self.Defaults.create_tokenizer
@@ -173,7 +177,10 @@ class Language(object):
     @property
     def meta(self):
-        self._meta.setdefault("lang", self.vocab.lang)
+        if self.vocab.lang:
+            self._meta.setdefault("lang", self.vocab.lang)
+        else:
+            self._meta.setdefault("lang", self.lang)
         self._meta.setdefault("name", "model")
         self._meta.setdefault("version", "0.0.0")
         self._meta.setdefault("spacy_version", ">={}".format(about.__version__))
@@ -618,7 +625,9 @@ class Language(object):
         if component_cfg is None:
             component_cfg = {}
         docs, golds = zip(*docs_golds)
-        docs = [self.make_doc(doc) if isinstance(doc, basestring_) else doc for doc in docs]
+        docs = [
+            self.make_doc(doc) if isinstance(doc, basestring_) else doc for doc in docs
+        ]
         golds = list(golds)
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
@@ -769,8 +778,12 @@ class Language(object):
             exclude = disable
         path = util.ensure_path(path)
         serializers = OrderedDict()
-        serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(p, exclude=["vocab"])
-        serializers["meta.json"] = lambda p: p.open("w").write(srsly.json_dumps(self.meta))
+        serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(
+            p, exclude=["vocab"]
+        )
+        serializers["meta.json"] = lambda p: p.open("w").write(
+            srsly.json_dumps(self.meta)
+        )
         for name, proc in self.pipeline:
             if not hasattr(proc, "name"):
                 continue
@@ -799,14 +812,20 @@ class Language(object):
         path = util.ensure_path(path)
         deserializers = OrderedDict()
         deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
-        deserializers["vocab"] = lambda p: self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)
-        deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(p, exclude=["vocab"])
+        deserializers["vocab"] = lambda p: self.vocab.from_disk(
+            p
+        ) and _fix_pretrained_vectors_name(self)
+        deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
+            p, exclude=["vocab"]
+        )
         for name, proc in self.pipeline:
             if name in exclude:
                 continue
             if not hasattr(proc, "from_disk"):
                 continue
-            deserializers[name] = lambda p, proc=proc: proc.from_disk(p, exclude=["vocab"])
+            deserializers[name] = lambda p, proc=proc: proc.from_disk(
+                p, exclude=["vocab"]
+            )
         if not (path / "vocab").exists() and "vocab" not in exclude:
             # Convert to list here in case exclude is (default) tuple
             exclude = list(exclude) + ["vocab"]
@@ -852,14 +871,20 @@ class Language(object):
             exclude = disable
         deserializers = OrderedDict()
         deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
-        deserializers["vocab"] = lambda b: self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)
-        deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(b, exclude=["vocab"])
+        deserializers["vocab"] = lambda b: self.vocab.from_bytes(
+            b
+        ) and _fix_pretrained_vectors_name(self)
+        deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
+            b, exclude=["vocab"]
+        )
         for name, proc in self.pipeline:
             if name in exclude:
                 continue
             if not hasattr(proc, "from_bytes"):
                 continue
-            deserializers[name] = lambda b, proc=proc: proc.from_bytes(b, exclude=["vocab"])
+            deserializers[name] = lambda b, proc=proc: proc.from_bytes(
+                b, exclude=["vocab"]
+            )
         exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
         util.from_bytes(bytes_data, deserializers, exclude)
         return self
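
For context, a hedged sketch of the behaviour the new E150 check gives Language.__init__: passing a vocab whose language disagrees with the pipeline class should now raise a ValueError, while a matching vocab is accepted. The German/English pairing below is only an illustration, not part of this commit:

# Minimal sketch, assuming spaCy 2.1.7 with the change above applied.
from spacy.lang.de import German
from spacy.lang.en import English

en_vocab = English().vocab      # vocab.lang == "en"
de_vocab = German().vocab       # vocab.lang == "de"

nlp = English(vocab=en_vocab)   # languages match: accepted
try:
    English(vocab=de_vocab)     # languages differ: expected to raise E150
except ValueError as err:
    print(err)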

View File

@@ -66,8 +66,12 @@ class Pipe(object):
         and `set_annotations()` methods.
         """
         self.require_model()
-        scores, tensors = self.predict([doc])
-        self.set_annotations([doc], scores, tensors=tensors)
+        predictions = self.predict([doc])
+        if isinstance(predictions, tuple) and len(tuple) == 2:
+            scores, tensors = predictions
+            self.set_annotations([doc], scores, tensor=tensors)
+        else:
+            self.set_annotations([doc], predictions)
         return doc
 
     def require_model(self):

View File

@@ -159,12 +159,19 @@ class Scorer(object):
                 else:
                     cand_deps.add((gold_i, gold_head, token.dep_.lower()))
         if "-" not in [token[-1] for token in gold.orig_annot]:
+            # Find all NER labels in gold and doc
+            ent_labels = set([x[0] for x in gold_ents]
+                             + [k.label_ for k in doc.ents])
+            # Set up all labels for per type scoring and prepare gold per type
+            gold_per_ents = {ent_label: set() for ent_label in ent_labels}
+            for ent_label in ent_labels:
+                if ent_label not in self.ner_per_ents:
+                    self.ner_per_ents[ent_label] = PRFScore()
+                gold_per_ents[ent_label].update([x for x in gold_ents if x[0] == ent_label])
+            # Find all candidate labels, for all and per type
             cand_ents = set()
-            current_ent = {k.label_: set() for k in doc.ents}
-            current_gold = {k.label_: set() for k in doc.ents}
+            cand_per_ents = {ent_label: set() for ent_label in ent_labels}
             for ent in doc.ents:
-                if ent.label_ not in self.ner_per_ents:
-                    self.ner_per_ents[ent.label_] = PRFScore()
                 first = gold.cand_to_gold[ent.start]
                 last = gold.cand_to_gold[ent.end - 1]
                 if first is None or last is None:
@@ -172,14 +179,11 @@ class Scorer(object):
                     self.ner_per_ents[ent.label_].fp += 1
                 else:
                     cand_ents.add((ent.label_, first, last))
-                    current_ent[ent.label_].update([x for x in cand_ents if x[0] == ent.label_])
-                    current_gold[ent.label_].update([x for x in gold_ents if x[0] == ent.label_])
+                    cand_per_ents[ent.label_].add((ent.label_, first, last))
             # Scores per ent
-            [
-                v.score_set(current_ent[k], current_gold[k])
-                for k, v in self.ner_per_ents.items()
-                if k in current_ent
-            ]
+            for k, v in self.ner_per_ents.items():
+                if k in cand_per_ents:
+                    v.score_set(cand_per_ents[k], gold_per_ents[k])
             # Score for all ents
             self.ner.score_set(cand_ents, gold_ents)
         self.tags.score_set(cand_tags, gold_tags)
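
As a rough illustration of what the per-type bookkeeping feeds into: each PRFScore compares a set of candidate (label, start, end) tuples against the gold set. A hypothetical stand-in (not spaCy's actual class) that reproduces the precision/recall/F arithmetic with invented spans:

# Hypothetical helper mirroring set-based PRF scoring; the spans below are invented.
def prf(cand, gold):
    tp = len(cand & gold)                        # predicted spans that exactly match gold
    fp = len(cand - gold)                        # predicted spans not in gold
    fn = len(gold - cand)                        # gold spans that were missed
    p = tp / (tp + fp) if (tp + fp) else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    f = 2 * p * r / (p + r) if (p + r) else 0.0
    return p, r, f

gold_org = {("ORG", 0, 0)}                       # one gold ORG span
cand_org = {("ORG", 0, 0), ("ORG", 6, 6)}        # one correct, one spurious prediction
print(prf(cand_org, gold_org))                   # (0.5, 1.0, 0.666...)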

View File

@@ -1,34 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from spacy.gold import GoldParse
-from spacy.scorer import Scorer
-from ..util import get_doc
-
-test_samples = [
-    [
-        "100 - 200",
-        {
-            "entities": [
-                [0, 3, "CARDINAL"],
-                [6, 9, "CARDINAL"]
-            ]
-        }
-    ]
-]
-
-
-def test_issue3625(en_vocab):
-    scorer = Scorer()
-    for input_, annot in test_samples:
-        doc = get_doc(en_vocab, words = input_.split(' '), ents = [[0,1,'CARDINAL'], [2,3,'CARDINAL']]);
-        gold = GoldParse(doc, entities = annot['entities'])
-        scorer.score(doc, gold)
-    results = scorer.scores
-
-    # Expects total accuracy and accuracy for each each entity to be 100%
-    assert results['ents_p'] == 100
-    assert results['ents_f'] == 100
-    assert results['ents_r'] == 100
-    assert results['ents_per_type']['CARDINAL']['p'] == 100
-    assert results['ents_per_type']['CARDINAL']['f'] == 100
-    assert results['ents_per_type']['CARDINAL']['r'] == 100

View File

@@ -0,0 +1,33 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.vocab import Vocab
+
+import spacy
+from spacy.lang.en import English
+from spacy.tests.util import make_tempdir
+from spacy.util import ensure_path
+
+
+def test_issue4054(en_vocab):
+    """Test that a new blank model can be made with a vocab from file,
+    and that serialization does not drop the language at any point."""
+    nlp1 = English()
+    vocab1 = nlp1.vocab
+
+    with make_tempdir() as d:
+        vocab_dir = ensure_path(d / "vocab")
+        if not vocab_dir.exists():
+            vocab_dir.mkdir()
+        vocab1.to_disk(vocab_dir)
+
+        vocab2 = Vocab().from_disk(vocab_dir)
+        print("lang", vocab2.lang)
+        nlp2 = spacy.blank("en", vocab=vocab2)
+
+        nlp_dir = ensure_path(d / "nlp")
+        if not nlp_dir.exists():
+            nlp_dir.mkdir()
+        nlp2.to_disk(nlp_dir)
+        nlp3 = spacy.load(nlp_dir)
+        assert nlp3.lang == "en"

View File

@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from pytest import approx
+from spacy.gold import GoldParse
+from spacy.scorer import Scorer
+from .util import get_doc
+
+test_ner_cardinal = [
+    [
+        "100 - 200",
+        {
+            "entities": [
+                [0, 3, "CARDINAL"],
+                [6, 9, "CARDINAL"]
+            ]
+        }
+    ]
+]
+
+test_ner_apple = [
+    [
+        "Apple is looking at buying U.K. startup for $1 billion",
+        {
+            "entities": [
+                (0, 5, "ORG"),
+                (27, 31, "GPE"),
+                (44, 54, "MONEY"),
+            ]
+        }
+    ]
+]
+
+
+def test_ner_per_type(en_vocab):
+    # Gold and Doc are identical
+    scorer = Scorer()
+    for input_, annot in test_ner_cardinal:
+        doc = get_doc(en_vocab, words = input_.split(' '), ents = [[0, 1, 'CARDINAL'], [2, 3, 'CARDINAL']])
+        gold = GoldParse(doc, entities = annot['entities'])
+        scorer.score(doc, gold)
+    results = scorer.scores
+
+    assert results['ents_p'] == 100
+    assert results['ents_f'] == 100
+    assert results['ents_r'] == 100
+    assert results['ents_per_type']['CARDINAL']['p'] == 100
+    assert results['ents_per_type']['CARDINAL']['f'] == 100
+    assert results['ents_per_type']['CARDINAL']['r'] == 100
+
+    # Doc has one missing and one extra entity
+    # Entity type MONEY is not present in Doc
+    scorer = Scorer()
+    for input_, annot in test_ner_apple:
+        doc = get_doc(en_vocab, words = input_.split(' '), ents = [[0, 1, 'ORG'], [5, 6, 'GPE'], [6, 7, 'ORG']])
+        gold = GoldParse(doc, entities = annot['entities'])
+        scorer.score(doc, gold)
+    results = scorer.scores
+
+    assert results['ents_p'] == approx(66.66666)
+    assert results['ents_r'] == approx(66.66666)
+    assert results['ents_f'] == approx(66.66666)
+    assert 'GPE' in results['ents_per_type']
+    assert 'MONEY' in results['ents_per_type']
+    assert 'ORG' in results['ents_per_type']
+    assert results['ents_per_type']['GPE']['p'] == 100
+    assert results['ents_per_type']['GPE']['r'] == 100
+    assert results['ents_per_type']['GPE']['f'] == 100
+    assert results['ents_per_type']['MONEY']['p'] == 0
+    assert results['ents_per_type']['MONEY']['r'] == 0
+    assert results['ents_per_type']['MONEY']['f'] == 0
+    assert results['ents_per_type']['ORG']['p'] == 50
+    assert results['ents_per_type']['ORG']['r'] == 100
+    assert results['ents_per_type']['ORG']['f'] == approx(66.66666)

View File

@@ -471,6 +471,17 @@ cdef class Span:
             self._vector_norm = xp.sqrt(total) if total != 0. else 0.
         return self._vector_norm
 
+    @property
+    def tensor(self):
+        """The span's slice of the doc's tensor.
+
+        RETURNS (ndarray[ndim=2, dtype='float32']): A 2D numpy or cupy array
+            representing the span's semantics.
+        """
+        if self.doc.tensor is None:
+            return None
+        return self.doc.tensor[self.start : self.end]
+
     @property
     def sentiment(self):
         """RETURNS (float): A scalar value indicating the positivity or

View File

@@ -408,6 +408,12 @@ cdef class Token:
         total = (vector ** 2).sum()
         return xp.sqrt(total) if total != 0. else 0.
 
+    @property
+    def tensor(self):
+        if self.doc.tensor is None:
+            return None
+        return self.doc.tensor[self.i]
+
     @property
     def n_lefts(self):
         """The number of leftward immediate children of the word, in the

View File

@@ -14,7 +14,7 @@ import Icon from '../components/icon'
 import Link from '../components/link'
 import Grid from '../components/grid'
 import Infobox from '../components/infobox'
-import { join, arrayToObj, abbrNum } from '../components/util'
+import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
 
 const MODEL_META = {
     core: 'Vocabulary, syntax, entities, vectors',
@@ -43,6 +43,10 @@ const MODEL_META = {
     compat: 'Latest compatible model version for your spaCy installation',
 }
 
+const MARKDOWN_COMPONENTS = {
+    code: InlineCode,
+}
+
 function getModelComponents(name) {
     const [lang, type, genre, size] = name.split('_')
     return { lang, type, genre, size }
@@ -192,10 +196,8 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
                     python -m spacy download {name}
                 </CodeBlock>
             </Aside>
-            {meta.description && <p>{meta.description}</p>}
-
+            {meta.description && markdownToReact(meta.description, MARKDOWN_COMPONENTS)}
             {isError && error}
-
             <Table>
                 <tbody>
                     {rows.map(({ label, tag, help, content }, i) =>
@@ -243,7 +245,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
                 )
             )}
         </Grid>
-        {meta.notes && <p>{meta.notes}</p>}
+        {meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
         {hasInteractiveCode && (
            <CodeBlock title="Try out the model" lang="python" executable={true}>
                {[