From f7d950de6df12e729c9beb25ee25ea3dac01afaf Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Thu, 1 Aug 2019 17:13:01 +0200
Subject: [PATCH 1/8] ensure the lang of vocab and nlp stay consistent (#4057)

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =
---
 spacy/errors.py                          |  2 +
 spacy/language.py                        | 47 ++++++++++++++++++------
 spacy/tests/regression/test_issue4054.py | 33 +++++++++++++++++
 3 files changed, 71 insertions(+), 11 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue4054.py

diff --git a/spacy/errors.py b/spacy/errors.py
index 1699809a7..945d3364a 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -415,6 +415,8 @@ class Errors(object):
             "is assigned to a KB identifier.")
     E149 = ("Error deserializing model. Check that the config used to create the "
             "component matches the model being loaded.")
+    E150 = ("The language of the `nlp` object and the `vocab` should be the same, "
+            "but found '{nlp}' and '{vocab}' respectively.")
 
 
 @add_codes
 class TempErrors(object):

diff --git a/spacy/language.py b/spacy/language.py
index bfdd00b79..b839be1f6 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -14,7 +14,8 @@ import srsly
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
-from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer, EntityLinker
+from .pipeline import DependencyParser, Tagger
+from .pipeline import Tensorizer, EntityRecognizer, EntityLinker
 from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
 from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
 from .pipeline import EntityRuler
@@ -158,6 +159,9 @@ class Language(object):
             vocab = factory(self, **meta.get("vocab", {}))
             if vocab.vectors.name is None:
                 vocab.vectors.name = meta.get("vectors", {}).get("name")
+        else:
+            if (self.lang and vocab.lang) and (self.lang != vocab.lang):
+                raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
         self.vocab = vocab
         if make_doc is True:
             factory = self.Defaults.create_tokenizer
@@ -173,7 +177,10 @@ class Language(object):
 
     @property
     def meta(self):
-        self._meta.setdefault("lang", self.vocab.lang)
+        if self.vocab.lang:
+            self._meta.setdefault("lang", self.vocab.lang)
+        else:
+            self._meta.setdefault("lang", self.lang)
         self._meta.setdefault("name", "model")
         self._meta.setdefault("version", "0.0.0")
         self._meta.setdefault("spacy_version", ">={}".format(about.__version__))
@@ -618,7 +625,9 @@ class Language(object):
         if component_cfg is None:
             component_cfg = {}
         docs, golds = zip(*docs_golds)
-        docs = [self.make_doc(doc) if isinstance(doc, basestring_) else doc for doc in docs]
+        docs = [
+            self.make_doc(doc) if isinstance(doc, basestring_) else doc for doc in docs
+        ]
         golds = list(golds)
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
@@ -769,8 +778,12 @@ class Language(object):
             exclude = disable
         path = util.ensure_path(path)
         serializers = OrderedDict()
-        serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(p, exclude=["vocab"])
-        serializers["meta.json"] = lambda p: p.open("w").write(srsly.json_dumps(self.meta))
+        serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(
+            p, exclude=["vocab"]
+        )
+        serializers["meta.json"] = lambda p: p.open("w").write(
+            srsly.json_dumps(self.meta)
+        )
         for name, proc in self.pipeline:
             if not hasattr(proc, "name"):
                 continue
@@ -799,14 +812,20 @@ class Language(object):
         path = util.ensure_path(path)
         deserializers = OrderedDict()
         deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
-        deserializers["vocab"] = lambda p: self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)
-        deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(p, exclude=["vocab"])
+        deserializers["vocab"] = lambda p: self.vocab.from_disk(
+            p
+        ) and _fix_pretrained_vectors_name(self)
+        deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
+            p, exclude=["vocab"]
+        )
         for name, proc in self.pipeline:
             if name in exclude:
                 continue
             if not hasattr(proc, "from_disk"):
                 continue
-            deserializers[name] = lambda p, proc=proc: proc.from_disk(p, exclude=["vocab"])
+            deserializers[name] = lambda p, proc=proc: proc.from_disk(
+                p, exclude=["vocab"]
+            )
         if not (path / "vocab").exists() and "vocab" not in exclude:
             # Convert to list here in case exclude is (default) tuple
             exclude = list(exclude) + ["vocab"]
@@ -852,14 +871,20 @@ class Language(object):
             exclude = disable
         deserializers = OrderedDict()
         deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
-        deserializers["vocab"] = lambda b: self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)
-        deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(b, exclude=["vocab"])
+        deserializers["vocab"] = lambda b: self.vocab.from_bytes(
+            b
+        ) and _fix_pretrained_vectors_name(self)
+        deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
+            b, exclude=["vocab"]
+        )
         for name, proc in self.pipeline:
             if name in exclude:
                 continue
             if not hasattr(proc, "from_bytes"):
                 continue
-            deserializers[name] = lambda b, proc=proc: proc.from_bytes(b, exclude=["vocab"])
+            deserializers[name] = lambda b, proc=proc: proc.from_bytes(
+                b, exclude=["vocab"]
+            )
         exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
         util.from_bytes(bytes_data, deserializers, exclude)
         return self

diff --git a/spacy/tests/regression/test_issue4054.py b/spacy/tests/regression/test_issue4054.py
new file mode 100644
index 000000000..2c9d73751
--- /dev/null
+++ b/spacy/tests/regression/test_issue4054.py
@@ -0,0 +1,33 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.vocab import Vocab
+
+import spacy
+from spacy.lang.en import English
+from spacy.tests.util import make_tempdir
+from spacy.util import ensure_path
+
+
+def test_issue4054(en_vocab):
+    """Test that a new blank model can be made with a vocab from file,
+    and that serialization does not drop the language at any point."""
+    nlp1 = English()
+    vocab1 = nlp1.vocab
+
+    with make_tempdir() as d:
+        vocab_dir = ensure_path(d / "vocab")
+        if not vocab_dir.exists():
+            vocab_dir.mkdir()
+        vocab1.to_disk(vocab_dir)
+
+        vocab2 = Vocab().from_disk(vocab_dir)
+        print("lang", vocab2.lang)
+        nlp2 = spacy.blank("en", vocab=vocab2)
+
+        nlp_dir = ensure_path(d / "nlp")
+        if not nlp_dir.exists():
+            nlp_dir.mkdir()
+        nlp2.to_disk(nlp_dir)
+        nlp3 = spacy.load(nlp_dir)
+        assert nlp3.lang == "en"
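The new E150 guard fires whenever a caller passes a vocab whose language disagrees with
the `nlp` object. A minimal sketch of the failure mode this patch catches (spaCy v2.1
API; the mismatch is constructed deliberately for illustration):

    from spacy.lang.de import German
    from spacy.lang.en import English

    de_vocab = German().vocab          # de_vocab.lang == "de"
    try:
        nlp = English(vocab=de_vocab)  # nlp.lang is "en" -> raises E150
    except ValueError as err:
        print(err)                     # languages of `nlp` and `vocab` must match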
From 925a852bb6450e16a23346e97a1813fc0fcb22a0 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Thu, 1 Aug 2019 17:15:36 +0200
Subject: [PATCH 2/8] Improve NER per type scoring (#4052)

* Improve NER per type scoring

* include all gold labels in per type scoring, not only when recall > 0

* improve efficiency of per type scoring

* Create Scorer tests, initially with NER tests

* move regression test #3968 (per type NER scoring) to Scorer tests

* add new test for per type NER scoring with imperfect P/R/F and per type
  P/R/F including a case where R == 0.0
---
 spacy/scorer.py                          | 26 +++++----
 spacy/tests/regression/test_issue3968.py | 34 -----------
 spacy/tests/test_scorer.py               | 73 ++++++++++++++++++++++++
 3 files changed, 88 insertions(+), 45 deletions(-)
 delete mode 100644 spacy/tests/regression/test_issue3968.py
 create mode 100644 spacy/tests/test_scorer.py

diff --git a/spacy/scorer.py b/spacy/scorer.py
index 34a9b7620..1362e9b4d 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -159,12 +159,19 @@ class Scorer(object):
             else:
                 cand_deps.add((gold_i, gold_head, token.dep_.lower()))
         if "-" not in [token[-1] for token in gold.orig_annot]:
+            # Find all NER labels in gold and doc
+            ent_labels = set([x[0] for x in gold_ents]
+                             + [k.label_ for k in doc.ents])
+            # Set up all labels for per type scoring and prepare gold per type
+            gold_per_ents = {ent_label: set() for ent_label in ent_labels}
+            for ent_label in ent_labels:
+                if ent_label not in self.ner_per_ents:
+                    self.ner_per_ents[ent_label] = PRFScore()
+                gold_per_ents[ent_label].update([x for x in gold_ents if x[0] == ent_label])
+            # Find all candidate labels, for all and per type
             cand_ents = set()
-            current_ent = {k.label_: set() for k in doc.ents}
-            current_gold = {k.label_: set() for k in doc.ents}
+            cand_per_ents = {ent_label: set() for ent_label in ent_labels}
             for ent in doc.ents:
-                if ent.label_ not in self.ner_per_ents:
-                    self.ner_per_ents[ent.label_] = PRFScore()
                 first = gold.cand_to_gold[ent.start]
                 last = gold.cand_to_gold[ent.end - 1]
                 if first is None or last is None:
@@ -172,14 +179,11 @@ class Scorer(object):
                     self.ner_per_ents[ent.label_].fp += 1
                 else:
                     cand_ents.add((ent.label_, first, last))
-                    current_ent[ent.label_].update([x for x in cand_ents if x[0] == ent.label_])
-                    current_gold[ent.label_].update([x for x in gold_ents if x[0] == ent.label_])
+                    cand_per_ents[ent.label_].add((ent.label_, first, last))
             # Scores per ent
-            [
-                v.score_set(current_ent[k], current_gold[k])
-                for k, v in self.ner_per_ents.items()
-                if k in current_ent
-            ]
+            for k, v in self.ner_per_ents.items():
+                if k in cand_per_ents:
+                    v.score_set(cand_per_ents[k], gold_per_ents[k])
             # Score for all ents
             self.ner.score_set(cand_ents, gold_ents)
         self.tags.score_set(cand_tags, gold_tags)

diff --git a/spacy/tests/regression/test_issue3968.py b/spacy/tests/regression/test_issue3968.py
deleted file mode 100644
index 7e970a3a9..000000000
--- a/spacy/tests/regression/test_issue3968.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from spacy.gold import GoldParse
-from spacy.scorer import Scorer
-from ..util import get_doc
-
-test_samples = [
-    [
-        "100 - 200",
-        {
-            "entities": [
-                [0, 3, "CARDINAL"],
-                [6, 9, "CARDINAL"]
-            ]
-        }
-    ]
-]
-
-def test_issue3625(en_vocab):
-    scorer = Scorer()
-    for input_, annot in test_samples:
-        doc = get_doc(en_vocab, words = input_.split(' '), ents = [[0,1,'CARDINAL'], [2,3,'CARDINAL']]);
-        gold = GoldParse(doc, entities = annot['entities'])
-        scorer.score(doc, gold)
-    results = scorer.scores
-
-    # Expects total accuracy and accuracy for each each entity to be 100%
-    assert results['ents_p'] == 100
-    assert results['ents_f'] == 100
-    assert results['ents_r'] == 100
-    assert results['ents_per_type']['CARDINAL']['p'] == 100
-    assert results['ents_per_type']['CARDINAL']['f'] == 100
-    assert results['ents_per_type']['CARDINAL']['r'] == 100

diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
new file mode 100644
index 000000000..a88aef368
--- /dev/null
+++ b/spacy/tests/test_scorer.py
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from pytest import approx
+from spacy.gold import GoldParse
+from spacy.scorer import Scorer
+from .util import get_doc
+
+test_ner_cardinal = [
+    [
+        "100 - 200",
+        {
+            "entities": [
+                [0, 3, "CARDINAL"],
+                [6, 9, "CARDINAL"]
+            ]
+        }
+    ]
+]
+
+test_ner_apple = [
+    [
+        "Apple is looking at buying U.K. startup for $1 billion",
+        {
+            "entities": [
+                (0, 5, "ORG"),
+                (27, 31, "GPE"),
+                (44, 54, "MONEY"),
+            ]
+        }
+    ]
+]
+
+def test_ner_per_type(en_vocab):
+    # Gold and Doc are identical
+    scorer = Scorer()
+    for input_, annot in test_ner_cardinal:
+        doc = get_doc(en_vocab, words = input_.split(' '), ents = [[0, 1, 'CARDINAL'], [2, 3, 'CARDINAL']])
+        gold = GoldParse(doc, entities = annot['entities'])
+        scorer.score(doc, gold)
+    results = scorer.scores
+
+    assert results['ents_p'] == 100
+    assert results['ents_f'] == 100
+    assert results['ents_r'] == 100
+    assert results['ents_per_type']['CARDINAL']['p'] == 100
+    assert results['ents_per_type']['CARDINAL']['f'] == 100
+    assert results['ents_per_type']['CARDINAL']['r'] == 100
+
+    # Doc has one missing and one extra entity
+    # Entity type MONEY is not present in Doc
+    scorer = Scorer()
+    for input_, annot in test_ner_apple:
+        doc = get_doc(en_vocab, words = input_.split(' '), ents = [[0, 1, 'ORG'], [5, 6, 'GPE'], [6, 7, 'ORG']])
+        gold = GoldParse(doc, entities = annot['entities'])
+        scorer.score(doc, gold)
+    results = scorer.scores
+
+    assert results['ents_p'] == approx(66.66666)
+    assert results['ents_r'] == approx(66.66666)
+    assert results['ents_f'] == approx(66.66666)
+    assert 'GPE' in results['ents_per_type']
+    assert 'MONEY' in results['ents_per_type']
+    assert 'ORG' in results['ents_per_type']
+    assert results['ents_per_type']['GPE']['p'] == 100
+    assert results['ents_per_type']['GPE']['r'] == 100
+    assert results['ents_per_type']['GPE']['f'] == 100
+    assert results['ents_per_type']['MONEY']['p'] == 0
+    assert results['ents_per_type']['MONEY']['r'] == 0
+    assert results['ents_per_type']['MONEY']['f'] == 0
+    assert results['ents_per_type']['ORG']['p'] == 50
+    assert results['ents_per_type']['ORG']['r'] == 100
+    assert results['ents_per_type']['ORG']['f'] == approx(66.66666)
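A sketch of what the per-type change means for callers, mirroring test_ner_per_type
above (spaCy v2.1 API; the hand-set doc.ents stand in for real pipeline predictions).
With this patch, MONEY is reported with recall 0.0 instead of being silently dropped:

    from spacy.gold import GoldParse
    from spacy.lang.en import English
    from spacy.scorer import Scorer
    from spacy.tokens import Span

    nlp = English()
    doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
    # Pretend the pipeline found ORG and GPE but missed MONEY entirely
    doc.ents = [
        Span(doc, 0, 1, label=doc.vocab.strings["ORG"]),
        Span(doc, 5, 6, label=doc.vocab.strings["GPE"]),
    ]
    gold = GoldParse(doc, entities=[(0, 5, "ORG"), (27, 31, "GPE"), (44, 54, "MONEY")])
    scorer = Scorer()
    scorer.score(doc, gold)
    print(scorer.scores["ents_per_type"]["MONEY"])  # p, r and f all 0.0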
From 8718ca8b1f173ebf1d710b1a463226526a8f9d8d Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 1 Aug 2019 17:26:09 +0200
Subject: [PATCH 3/8] Fix init_model if there's no vocab (closes #4048) (#4049)

---
 spacy/cli/init_model.py | 45 +++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 6626b52e4..f3b60e7fa 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -24,6 +24,7 @@ except ImportError:
     ftfy = None
 
 
+DEFAULT_OOV_PROB = -20
 msg = Printer()
 
 
@@ -108,23 +109,30 @@ def open_file(loc):
 
 
 def read_attrs_from_deprecated(freqs_loc, clusters_loc):
-    with msg.loading("Counting frequencies..."):
-        probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
-    msg.good("Counted frequencies")
-    with msg.loading("Reading clusters..."):
-        clusters = read_clusters(clusters_loc) if clusters_loc else {}
-    msg.good("Read clusters")
+    if freqs_loc is not None:
+        with msg.loading("Counting frequencies..."):
+            probs, _ = read_freqs(freqs_loc)
+        msg.good("Counted frequencies")
+    else:
+        probs, _ = ({}, DEFAULT_OOV_PROB)
+    if clusters_loc:
+        with msg.loading("Reading clusters..."):
+            clusters = read_clusters(clusters_loc)
+        msg.good("Read clusters")
+    else:
+        clusters = {}
     lex_attrs = []
     sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
-    for i, (word, prob) in tqdm(enumerate(sorted_probs)):
-        attrs = {"orth": word, "id": i, "prob": prob}
-        # Decode as a little-endian string, so that we can do & 15 to get
-        # the first 4 bits. See _parse_features.pyx
-        if word in clusters:
-            attrs["cluster"] = int(clusters[word][::-1], 2)
-        else:
-            attrs["cluster"] = 0
-        lex_attrs.append(attrs)
+    if len(sorted_probs):
+        for i, (word, prob) in tqdm(enumerate(sorted_probs)):
+            attrs = {"orth": word, "id": i, "prob": prob}
+            # Decode as a little-endian string, so that we can do & 15 to get
+            # the first 4 bits. See _parse_features.pyx
+            if word in clusters:
+                attrs["cluster"] = int(clusters[word][::-1], 2)
+            else:
+                attrs["cluster"] = 0
+            lex_attrs.append(attrs)
     return lex_attrs
 
 
@@ -142,8 +150,11 @@ def create_model(lang, lex_attrs):
             lexeme.is_oov = False
             lex_added += 1
         lex_added += 1
-    oov_prob = min(lex.prob for lex in nlp.vocab)
-    nlp.vocab.cfg.update({"oov_prob": oov_prob - 1})
+    if len(nlp.vocab):
+        oov_prob = min(lex.prob for lex in nlp.vocab) - 1
+    else:
+        oov_prob = DEFAULT_OOV_PROB
+    nlp.vocab.cfg.update({"oov_prob": oov_prob})
     return nlp

From 4632c597e7d1d175077a7bf147cf9ad201ef04e5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 1 Aug 2019 17:29:01 +0200
Subject: [PATCH 4/8] Fix Pipe base class

---
 spacy/pipeline/pipes.pyx | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index ba1fca24e..375a0884b 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -66,8 +66,12 @@ class Pipe(object):
         and `set_annotations()` methods.
         """
         self.require_model()
-        scores, tensors = self.predict([doc])
-        self.set_annotations([doc], scores, tensors=tensors)
+        predictions = self.predict([doc])
+        if isinstance(predictions, tuple) and len(predictions) == 2:
+            scores, tensors = predictions
+            self.set_annotations([doc], scores, tensors=tensors)
+        else:
+            self.set_annotations([doc], predictions)
         return doc
 
     def require_model(self):
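A sketch of the case the base-class fix unblocks: a custom component whose predict()
returns plain scores rather than a (scores, tensors) tuple. The component name and the
model stub are illustrative only, not part of the patch (spaCy v2.1 API):

    from spacy.lang.en import English
    from spacy.pipeline.pipes import Pipe

    class GoodCounter(Pipe):
        name = "good_counter"

        def __init__(self):
            self.model = lambda docs: docs  # truthy stub so require_model() passes
            self.cfg = {}

        def predict(self, docs):
            # Returns a plain list of scores, not a (scores, tensors) tuple
            return [sum(token.lower_ == "good" for token in doc) for doc in docs]

        def set_annotations(self, docs, scores, tensors=None):
            for doc, score in zip(docs, scores):
                doc.user_data["good_count"] = score

    nlp = English()
    doc = GoodCounter()(nlp("the good parts are good"))  # takes the new else branch
    print(doc.user_data["good_count"])  # 2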
From 97c51ef93bbad7bb63be3c5280ce2e48af6db513 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 1 Aug 2019 17:29:25 +0200
Subject: [PATCH 5/8] Set version to v2.1.7.dev1

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 1b786a82a..4ec537d4b 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -4,7 +4,7 @@
 
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.1.7.dev0"
+__version__ = "2.1.7.dev1"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"

From d3071ecdbc769c823636a4c64a4ee5b7c3ffc76c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 1 Aug 2019 18:09:19 +0200
Subject: [PATCH 6/8] Set version to v2.1.7

---
 spacy/about.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/about.py b/spacy/about.py
index 4ec537d4b..ae65922c4 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -4,13 +4,13 @@
 
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.1.7.dev1"
+__version__ = "2.1.7"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
 __email__ = "contact@explosion.ai"
 __license__ = "MIT"
-__release__ = False
+__release__ = True
 
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 944a66c326f18a3fcf3cd76798ecdc2405e1f026 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 1 Aug 2019 18:30:50 +0200
Subject: [PATCH 7/8] Add span.tensor and token.tensor attributes

---
 spacy/tokens/span.pyx  | 11 +++++++++++
 spacy/tokens/token.pyx |  6 ++++++
 2 files changed, 17 insertions(+)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 460972369..f702133af 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -471,6 +471,17 @@ cdef class Span:
             self._vector_norm = xp.sqrt(total) if total != 0. else 0.
         return self._vector_norm
 
+    @property
+    def tensor(self):
+        """The span's slice of the doc's tensor.
+
+        RETURNS (ndarray[ndim=2, dtype='float32']): A 2D numpy or cupy array
+        representing the span's semantics.
+        """
+        if self.doc.tensor is None:
+            return None
+        return self.doc.tensor[self.start : self.end]
+
     @property
     def sentiment(self):
         """RETURNS (float): A scalar value indicating the positivity or

diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 909ebecbb..07c6f1c99 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -408,6 +408,12 @@ cdef class Token:
         total = (vector ** 2).sum()
         return xp.sqrt(total) if total != 0. else 0.
 
+    @property
+    def tensor(self):
+        if self.doc.tensor is None:
+            return None
+        return self.doc.tensor[self.i]
+
     @property
     def n_lefts(self):
         """The number of leftward immediate children of the word, in the
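The new attributes slice doc.tensor per token and per span. A small usage sketch
(spaCy v2.1; the zero tensor and its width of 96 are placeholders for what a trained
pipeline, e.g. the Tensorizer, would actually set):

    import numpy
    from spacy.lang.en import English

    nlp = English()
    doc = nlp("green ideas sleep furiously")
    doc.tensor = numpy.zeros((len(doc), 96), dtype="float32")

    print(doc[1].tensor.shape)    # (96,)   one row of doc.tensor
    print(doc[1:3].tensor.shape)  # (2, 96) rows for tokens 1 and 2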

From 3072eb28c201dd0242e1aa6ffca2fff8eddf274f Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 1 Aug 2019 18:33:10 +0200
Subject: [PATCH 8/8] Support and render Markdown in model meta [ci skip]

---
 website/src/templates/models.js | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index b9cd81c97..4713f4b34 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -14,7 +14,7 @@ import Icon from '../components/icon'
 import Link from '../components/link'
 import Grid from '../components/grid'
 import Infobox from '../components/infobox'
-import { join, arrayToObj, abbrNum } from '../components/util'
+import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
 
 const MODEL_META = {
     core: 'Vocabulary, syntax, entities, vectors',
@@ -43,6 +43,10 @@ const MODEL_META = {
     compat: 'Latest compatible model version for your spaCy installation',
 }
 
+const MARKDOWN_COMPONENTS = {
+    code: InlineCode,
+}
+
 function getModelComponents(name) {
     const [lang, type, genre, size] = name.split('_')
     return { lang, type, genre, size }
 }
@@ -192,10 +196,8 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
                 python -m spacy download {name}
-            {meta.description && <p>{meta.description}</p>}
-
+            {meta.description && markdownToReact(meta.description, MARKDOWN_COMPONENTS)}
             {isError && error}
-
             {rows.map(({ label, tag, help, content }, i) =>
@@ -243,7 +245,7 @@
                 )
             )}
-            {meta.notes && <p>{meta.notes}</p>}
+            {meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
             {hasInteractiveCode && (
                 {[