Merge branch 'master' into spacy.io

This commit is contained in:
Ines Montani 2019-11-04 13:56:11 +01:00
commit d7a94edba6
17 changed files with 125 additions and 31 deletions

View File

@ -4,7 +4,7 @@ preshed>=3.0.2,<3.1.0
thinc>=7.3.0,<7.4.0 thinc>=7.3.0,<7.4.0
blis>=0.4.0,<0.5.0 blis>=0.4.0,<0.5.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
wasabi>=0.3.0,<1.1.0 wasabi>=0.4.0,<1.1.0
srsly>=0.1.0,<1.1.0 srsly>=0.1.0,<1.1.0
# Third party dependencies # Third party dependencies
numpy>=1.15.0 numpy>=1.15.0

View File

@ -40,17 +40,19 @@ setup_requires =
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
thinc>=7.3.0,<7.4.0 thinc>=7.3.0,<7.4.0
install_requires = install_requires =
setuptools # Our libraries
numpy>=1.15.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=7.3.0,<7.4.0 thinc>=7.3.0,<7.4.0
blis>=0.4.0,<0.5.0 blis>=0.4.0,<0.5.0
wasabi>=0.4.0,<1.1.0
srsly>=0.1.0,<1.1.0
# Third-party dependencies
setuptools
numpy>=1.15.0
plac>=0.9.6,<1.2.0 plac>=0.9.6,<1.2.0
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0
wasabi>=0.3.0,<1.1.0
srsly>=0.1.0,<1.1.0
pathlib==1.0.1; python_version < "3.4" pathlib==1.0.1; python_version < "3.4"
importlib_metadata>=0.20; python_version < "3.8" importlib_metadata>=0.20; python_version < "3.8"

View File

@ -7,12 +7,10 @@ from __future__ import print_function
if __name__ == "__main__": if __name__ == "__main__":
import plac import plac
import sys import sys
from wasabi import Printer from wasabi import msg
from spacy.cli import download, link, info, package, train, pretrain, convert from spacy.cli import download, link, info, package, train, pretrain, convert
from spacy.cli import init_model, profile, evaluate, validate, debug_data from spacy.cli import init_model, profile, evaluate, validate, debug_data
msg = Printer()
commands = { commands = {
"download": download, "download": download,
"link": link, "link": link,

View File

@ -121,6 +121,8 @@ def debug_data(
msg.text("{} training docs".format(len(train_docs))) msg.text("{} training docs".format(len(train_docs)))
msg.text("{} evaluation docs".format(len(dev_docs))) msg.text("{} evaluation docs".format(len(dev_docs)))
if not len(dev_docs):
msg.fail("No evaluation docs")
overlap = len(train_texts.intersection(dev_texts)) overlap = len(train_texts.intersection(dev_texts))
if overlap: if overlap:
msg.warn("{} training examples also in evaluation data".format(overlap)) msg.warn("{} training examples also in evaluation data".format(overlap))

View File

@ -6,16 +6,13 @@ import requests
import os import os
import subprocess import subprocess
import sys import sys
from wasabi import Printer from wasabi import msg
from .link import link from .link import link
from ..util import get_package_path from ..util import get_package_path
from .. import about from .. import about
msg = Printer()
@plac.annotations( @plac.annotations(
model=("Model to download (shortcut or name)", "positional", None, str), model=("Model to download (shortcut or name)", "positional", None, str),
direct=("Force direct download of name + version", "flag", "d", bool), direct=("Force direct download of name + version", "flag", "d", bool),

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals, division, print_function
import plac import plac
from timeit import default_timer as timer from timeit import default_timer as timer
from wasabi import Printer from wasabi import msg
from ..gold import GoldCorpus from ..gold import GoldCorpus
from .. import util from .. import util
@ -32,7 +32,6 @@ def evaluate(
Evaluate a model. To render a sample of parses in a HTML file, set an Evaluate a model. To render a sample of parses in a HTML file, set an
output directory as the displacy_path argument. output directory as the displacy_path argument.
""" """
msg = Printer()
util.fix_random_seed() util.fix_random_seed()
if gpu_id >= 0: if gpu_id >= 0:
util.use_gpu(gpu_id) util.use_gpu(gpu_id)

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import plac import plac
import platform import platform
from pathlib import Path from pathlib import Path
from wasabi import Printer from wasabi import msg
import srsly import srsly
from ..compat import path2str, basestring_, unicode_ from ..compat import path2str, basestring_, unicode_
@ -23,7 +23,6 @@ def info(model=None, markdown=False, silent=False):
speficied as an argument, print model information. Flag --markdown speficied as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues. prints details in Markdown for easy copy-pasting to GitHub issues.
""" """
msg = Printer()
if model: if model:
if util.is_package(model): if util.is_package(model):
model_path = util.get_package_path(model) model_path = util.get_package_path(model)

View File

@ -11,7 +11,7 @@ import tarfile
import gzip import gzip
import zipfile import zipfile
import srsly import srsly
from wasabi import Printer from wasabi import msg
from ..vectors import Vectors from ..vectors import Vectors
from ..errors import Errors, Warnings, user_warning from ..errors import Errors, Warnings, user_warning
@ -24,7 +24,6 @@ except ImportError:
DEFAULT_OOV_PROB = -20 DEFAULT_OOV_PROB = -20
msg = Printer()
@plac.annotations( @plac.annotations(

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
import plac import plac
from pathlib import Path from pathlib import Path
from wasabi import Printer from wasabi import msg
from ..compat import symlink_to, path2str from ..compat import symlink_to, path2str
from .. import util from .. import util
@ -20,7 +20,6 @@ def link(origin, link_name, force=False, model_path=None):
either the name of a pip package, or the local path to the model data either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name). directory. Linking models allows loading them via spacy.load(link_name).
""" """
msg = Printer()
if util.is_package(origin): if util.is_package(origin):
model_path = util.get_package_path(origin) model_path = util.get_package_path(origin)
else: else:

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import plac import plac
import shutil import shutil
from pathlib import Path from pathlib import Path
from wasabi import Printer, get_raw_input from wasabi import msg, get_raw_input
import srsly import srsly
from ..compat import path2str from ..compat import path2str
@ -27,7 +27,6 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
set and a meta.json already exists in the output directory, the existing set and a meta.json already exists in the output directory, the existing
values will be used as the defaults in the command-line prompt. values will be used as the defaults in the command-line prompt.
""" """
msg = Printer()
input_path = util.ensure_path(input_dir) input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir) output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta_path) meta_path = util.ensure_path(meta_path)

View File

@ -11,7 +11,7 @@ from pathlib import Path
from thinc.v2v import Affine, Maxout from thinc.v2v import Affine, Maxout
from thinc.misc import LayerNorm as LN from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu from thinc.neural.util import prefer_gpu
from wasabi import Printer from wasabi import msg
import srsly import srsly
from ..errors import Errors from ..errors import Errors
@ -122,7 +122,6 @@ def pretrain(
for key in config: for key in config:
if isinstance(config[key], Path): if isinstance(config[key], Path):
config[key] = str(config[key]) config[key] = str(config[key])
msg = Printer()
util.fix_random_seed(seed) util.fix_random_seed(seed)
has_gpu = prefer_gpu() has_gpu = prefer_gpu()

View File

@ -9,7 +9,7 @@ import pstats
import sys import sys
import itertools import itertools
import thinc.extra.datasets import thinc.extra.datasets
from wasabi import Printer from wasabi import msg
from ..util import load_model from ..util import load_model
@ -26,7 +26,6 @@ def profile(model, inputs=None, n_texts=10000):
It can either be provided as a JSONL file, or be read from sys.sytdin. It can either be provided as a JSONL file, or be read from sys.sytdin.
If no input file is specified, the IMDB dataset is loaded via Thinc. If no input file is specified, the IMDB dataset is loaded via Thinc.
""" """
msg = Printer()
if inputs is not None: if inputs is not None:
inputs = _read_inputs(inputs, msg) inputs = _read_inputs(inputs, msg)
if inputs is None: if inputs is None:

View File

@ -8,7 +8,7 @@ from thinc.neural._classes.model import Model
from timeit import default_timer as timer from timeit import default_timer as timer
import shutil import shutil
import srsly import srsly
from wasabi import Printer from wasabi import msg
import contextlib import contextlib
import random import random
@ -89,7 +89,6 @@ def train(
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm import tqdm
msg = Printer()
util.fix_random_seed() util.fix_random_seed()
util.set_env_log(verbose) util.set_env_log(verbose)

View File

@ -5,7 +5,7 @@ from pathlib import Path
import sys import sys
import requests import requests
import srsly import srsly
from wasabi import Printer from wasabi import msg
from ..compat import path2str from ..compat import path2str
from ..util import get_data_path from ..util import get_data_path
@ -17,7 +17,6 @@ def validate():
Validate that the currently installed version of spaCy is compatible Validate that the currently installed version of spaCy is compatible
with the installed models. Should be run after `pip install -U spacy`. with the installed models. Should be run after `pip install -U spacy`.
""" """
msg = Printer()
with msg.loading("Loading compatibility table..."): with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__) r = requests.get(about.__compatibility__)
if r.status_code != 200: if r.status_code != 200:

View File

@ -82,6 +82,7 @@ class Scorer(object):
self.sbd = PRFScore() self.sbd = PRFScore()
self.unlabelled = PRFScore() self.unlabelled = PRFScore()
self.labelled = PRFScore() self.labelled = PRFScore()
self.labelled_per_dep = dict()
self.tags = PRFScore() self.tags = PRFScore()
self.ner = PRFScore() self.ner = PRFScore()
self.ner_per_ents = dict() self.ner_per_ents = dict()
@ -124,9 +125,18 @@ class Scorer(object):
@property @property
def las(self): def las(self):
"""RETURNS (float): Labelled depdendency score.""" """RETURNS (float): Labelled dependency score."""
return self.labelled.fscore * 100 return self.labelled.fscore * 100
@property
def las_per_type(self):
    """RETURNS (dict): Labelled dependency scores keyed by dependency label.

    Each value is a dict with the keys "p", "r" and "f", holding that
    label's precision, recall and F-score, each multiplied by 100.
    """
    return {
        label: {
            "p": prf.precision * 100,
            "r": prf.recall * 100,
            "f": prf.fscore * 100,
        }
        for label, prf in self.labelled_per_dep.items()
    }
@property @property
def ents_p(self): def ents_p(self):
"""RETURNS (float): Named entity accuracy (precision).""" """RETURNS (float): Named entity accuracy (precision)."""
@ -196,6 +206,7 @@ class Scorer(object):
return { return {
"uas": self.uas, "uas": self.uas,
"las": self.las, "las": self.las,
"las_per_type": self.las_per_type,
"ents_p": self.ents_p, "ents_p": self.ents_p,
"ents_r": self.ents_r, "ents_r": self.ents_r,
"ents_f": self.ents_f, "ents_f": self.ents_f,
@ -223,13 +234,20 @@ class Scorer(object):
doc, tuple(zip(*gold.orig_annot)) + (gold.cats,) doc, tuple(zip(*gold.orig_annot)) + (gold.cats,)
) )
gold_deps = set() gold_deps = set()
gold_deps_per_dep = {}
gold_tags = set() gold_tags = set()
gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot])) gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
for id_, word, tag, head, dep, ner in gold.orig_annot: for id_, word, tag, head, dep, ner in gold.orig_annot:
gold_tags.add((id_, tag)) gold_tags.add((id_, tag))
if dep not in (None, "") and dep.lower() not in punct_labels: if dep not in (None, "") and dep.lower() not in punct_labels:
gold_deps.add((id_, head, dep.lower())) gold_deps.add((id_, head, dep.lower()))
if dep.lower() not in self.labelled_per_dep:
self.labelled_per_dep[dep.lower()] = PRFScore()
if dep.lower() not in gold_deps_per_dep:
gold_deps_per_dep[dep.lower()] = set()
gold_deps_per_dep[dep.lower()].add((id_, head, dep.lower()))
cand_deps = set() cand_deps = set()
cand_deps_per_dep = {}
cand_tags = set() cand_tags = set()
for token in doc: for token in doc:
if token.orth_.isspace(): if token.orth_.isspace():
@ -249,6 +267,11 @@ class Scorer(object):
self.labelled.fp += 1 self.labelled.fp += 1
else: else:
cand_deps.add((gold_i, gold_head, token.dep_.lower())) cand_deps.add((gold_i, gold_head, token.dep_.lower()))
if token.dep_.lower() not in self.labelled_per_dep:
self.labelled_per_dep[token.dep_.lower()] = PRFScore()
if token.dep_.lower() not in cand_deps_per_dep:
cand_deps_per_dep[token.dep_.lower()] = set()
cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower()))
if "-" not in [token[-1] for token in gold.orig_annot]: if "-" not in [token[-1] for token in gold.orig_annot]:
# Find all NER labels in gold and doc # Find all NER labels in gold and doc
ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents]) ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
@ -280,6 +303,8 @@ class Scorer(object):
self.ner.score_set(cand_ents, gold_ents) self.ner.score_set(cand_ents, gold_ents)
self.tags.score_set(cand_tags, gold_tags) self.tags.score_set(cand_tags, gold_tags)
self.labelled.score_set(cand_deps, gold_deps) self.labelled.score_set(cand_deps, gold_deps)
for dep in self.labelled_per_dep:
self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set()))
self.unlabelled.score_set( self.unlabelled.score_set(
set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps) set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
) )

View File

@ -9,6 +9,14 @@ from spacy.scorer import Scorer, ROCAUCScore
from spacy.scorer import _roc_auc_score, _roc_curve from spacy.scorer import _roc_auc_score, _roc_curve
from .util import get_doc from .util import get_doc
# Dependency-scoring fixture: a list of [text, annotations] pairs. The
# annotations carry one head index ("heads") and one dependency label
# ("deps") per space-separated token of the text.
test_las_apple = [
    [
        "Apple is looking at buying U.K. startup for $ 1 billion",
        {
            "heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
            "deps": [
                "nsubj",
                "aux",
                "ROOT",
                "prep",
                "pcomp",
                "compound",
                "dobj",
                "prep",
                "quantmod",
                "compound",
                "pobj",
            ],
        },
    ]
]
test_ner_cardinal = [ test_ner_cardinal = [
["100 - 200", {"entities": [[0, 3, "CARDINAL"], [6, 9, "CARDINAL"]]}] ["100 - 200", {"entities": [[0, 3, "CARDINAL"], [6, 9, "CARDINAL"]]}]
] ]
@ -21,6 +29,53 @@ test_ner_apple = [
] ]
def test_las_per_type(en_vocab):
    """Check the per-dependency-label P/R/F breakdown in ``Scorer.scores``.

    Two scenarios are scored over ``test_las_apple``: a predicted parse that
    is identical to the gold parse, and one where the first token's label is
    overwritten with "compound" after the gold parse has been built.
    """

    def score_fixture(corrupt_first_dep=False):
        # Build a fresh scorer, score every fixture example and return the
        # resulting scores dict.
        scorer = Scorer()
        for input_, annot in test_las_apple:
            doc = get_doc(
                en_vocab,
                words=input_.split(" "),
                # h - i turns each head index into an offset from the
                # token's own position.
                heads=[h - i for i, h in enumerate(annot["heads"])],
                deps=annot["deps"],
            )
            gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
            if corrupt_first_dep:
                # Mutate the prediction only after the gold parse exists.
                doc[0].dep_ = "compound"
            scorer.score(doc, gold)
        return scorer.scores

    # Gold and Doc are identical
    results = score_fixture()
    assert results["uas"] == 100
    assert results["las"] == 100
    assert results["las_per_type"]["nsubj"]["p"] == 100
    assert results["las_per_type"]["nsubj"]["r"] == 100
    assert results["las_per_type"]["nsubj"]["f"] == 100
    assert results["las_per_type"]["compound"]["p"] == 100
    assert results["las_per_type"]["compound"]["r"] == 100
    assert results["las_per_type"]["compound"]["f"] == 100

    # One dep is incorrect in Doc: heads are untouched, so UAS stays at 100
    # while one of the eleven labels is now wrong.
    results = score_fixture(corrupt_first_dep=True)
    assert results["uas"] == 100
    assert_almost_equal(results["las"], 90.9090909)
    assert results["las_per_type"]["nsubj"]["p"] == 0
    assert results["las_per_type"]["nsubj"]["r"] == 0
    assert results["las_per_type"]["nsubj"]["f"] == 0
    # The fixture has two gold "compound" arcs; the relabelled token adds a
    # third, incorrect prediction.
    assert_almost_equal(results["las_per_type"]["compound"]["p"], 66.6666666)
    assert results["las_per_type"]["compound"]["r"] == 100
    # The F-score is computed from floats, so compare approximately rather
    # than with exact equality.
    assert_almost_equal(results["las_per_type"]["compound"]["f"], 80)
def test_ner_per_type(en_vocab): def test_ner_per_type(en_vocab):
# Gold and Doc are identical # Gold and Doc are identical
scorer = Scorer() scorer = Scorer()

View File

@ -1861,6 +1861,30 @@
"author_links": { "author_links": {
"github": "microsoft" "github": "microsoft"
} }
},
{
"id": "dframcy",
"title": "Dframcy",
"slogan": "Dataframe Integration with spaCy NLP",
"github": "yash1994/dframcy",
"description": "DframCy is a lightweight utility module that integrates pandas DataFrames with spaCy's linguistic annotation and training tasks.",
"pip": "dframcy",
"category": ["pipeline", "training"],
"tags": ["pandas"],
"code_example": [
"import spacy",
"from dframcy import DframCy",
"",
"nlp = spacy.load('en_core_web_sm')",
"dframcy = DframCy(nlp)",
"doc = dframcy.nlp(u'Apple is looking at buying U.K. startup for $1 billion')",
"annotation_dataframe = dframcy.to_dataframe(doc)"
],
"author": "Yash Patadia",
"author_links": {
"twitter": "PatadiaYash",
"github": "yash1994"
}
} }
], ],