Merge branch 'master' into spacy.io

Ines Montani 2019-11-04 13:56:11 +01:00
commit d7a94edba6
17 changed files with 125 additions and 31 deletions

requirements.txt

@@ -4,7 +4,7 @@ preshed>=3.0.2,<3.1.0
 thinc>=7.3.0,<7.4.0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.3.0,<1.1.0
+wasabi>=0.4.0,<1.1.0
 srsly>=0.1.0,<1.1.0
 # Third party dependencies
 numpy>=1.15.0
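
The wasabi pin bump above (mirrored in setup.cfg below) goes hand in hand with the CLI changes in this commit: newer wasabi versions expose a ready-made module-level Printer as msg, so each command imports it instead of constructing its own, which is presumably why the pin moves from >=0.3.0 to >=0.4.0. A minimal sketch of the pattern, using only the wasabi Printer methods that appear in this diff (the messages themselves are invented examples):

    from wasabi import msg  # shared module-level Printer

    msg.text("1000 training docs")  # plain informational message
    msg.warn("10 training examples also in evaluation data")
    msg.fail("No evaluation docs")  # red cross, printed as an error
    with msg.loading("Loading compatibility table..."):
        pass  # spinner is shown while this block runs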

setup.cfg

@@ -40,17 +40,19 @@ setup_requires =
     murmurhash>=0.28.0,<1.1.0
     thinc>=7.3.0,<7.4.0
 install_requires =
-    setuptools
-    numpy>=1.15.0
+    # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=7.3.0,<7.4.0
     blis>=0.4.0,<0.5.0
+    wasabi>=0.4.0,<1.1.0
+    srsly>=0.1.0,<1.1.0
+    # Third-party dependencies
+    setuptools
+    numpy>=1.15.0
     plac>=0.9.6,<1.2.0
     requests>=2.13.0,<3.0.0
-    wasabi>=0.3.0,<1.1.0
-    srsly>=0.1.0,<1.1.0
     pathlib==1.0.1; python_version < "3.4"
     importlib_metadata>=0.20; python_version < "3.8"

spacy/__main__.py

@@ -7,12 +7,10 @@ from __future__ import print_function
 if __name__ == "__main__":
     import plac
     import sys
-    from wasabi import Printer
+    from wasabi import msg
     from spacy.cli import download, link, info, package, train, pretrain, convert
     from spacy.cli import init_model, profile, evaluate, validate, debug_data
-    msg = Printer()
     commands = {
         "download": download,
         "link": link,

spacy/cli/debug_data.py

@@ -121,6 +121,8 @@ def debug_data(
     msg.text("{} training docs".format(len(train_docs)))
     msg.text("{} evaluation docs".format(len(dev_docs)))
+    if not len(dev_docs):
+        msg.fail("No evaluation docs")
     overlap = len(train_texts.intersection(dev_texts))
     if overlap:
         msg.warn("{} training examples also in evaluation data".format(overlap))

spacy/cli/download.py

@@ -6,16 +6,13 @@ import requests
 import os
 import subprocess
 import sys
-from wasabi import Printer
+from wasabi import msg
 from .link import link
 from ..util import get_package_path
 from .. import about
-msg = Printer()
 @plac.annotations(
     model=("Model to download (shortcut or name)", "positional", None, str),
     direct=("Force direct download of name + version", "flag", "d", bool),

spacy/cli/evaluate.py

@@ -3,7 +3,7 @@ from __future__ import unicode_literals, division, print_function
 import plac
 from timeit import default_timer as timer
-from wasabi import Printer
+from wasabi import msg
 from ..gold import GoldCorpus
 from .. import util
@@ -32,7 +32,6 @@ def evaluate(
     Evaluate a model. To render a sample of parses in a HTML file, set an
     output directory as the displacy_path argument.
     """
-    msg = Printer()
     util.fix_random_seed()
     if gpu_id >= 0:
         util.use_gpu(gpu_id)

spacy/cli/info.py

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import plac
 import platform
 from pathlib import Path
-from wasabi import Printer
+from wasabi import msg
 import srsly
 from ..compat import path2str, basestring_, unicode_
@@ -23,7 +23,6 @@ def info(model=None, markdown=False, silent=False):
     specified as an argument, print model information. Flag --markdown
     prints details in Markdown for easy copy-pasting to GitHub issues.
     """
-    msg = Printer()
     if model:
         if util.is_package(model):
             model_path = util.get_package_path(model)

spacy/cli/init_model.py

@@ -11,7 +11,7 @@ import tarfile
 import gzip
 import zipfile
 import srsly
-from wasabi import Printer
+from wasabi import msg
 from ..vectors import Vectors
 from ..errors import Errors, Warnings, user_warning
@@ -24,7 +24,6 @@ except ImportError:
 DEFAULT_OOV_PROB = -20
-msg = Printer()
 @plac.annotations(

spacy/cli/link.py

@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 import plac
 from pathlib import Path
-from wasabi import Printer
+from wasabi import msg
 from ..compat import symlink_to, path2str
 from .. import util
@@ -20,7 +20,6 @@ def link(origin, link_name, force=False, model_path=None):
     either the name of a pip package, or the local path to the model data
     directory. Linking models allows loading them via spacy.load(link_name).
     """
-    msg = Printer()
     if util.is_package(origin):
         model_path = util.get_package_path(origin)
     else:

spacy/cli/package.py

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import plac
 import shutil
 from pathlib import Path
-from wasabi import Printer, get_raw_input
+from wasabi import msg, get_raw_input
 import srsly
 from ..compat import path2str
@@ -27,7 +27,6 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
     set and a meta.json already exists in the output directory, the existing
     values will be used as the defaults in the command-line prompt.
     """
-    msg = Printer()
     input_path = util.ensure_path(input_dir)
     output_path = util.ensure_path(output_dir)
     meta_path = util.ensure_path(meta_path)

spacy/cli/pretrain.py

@@ -11,7 +11,7 @@ from pathlib import Path
 from thinc.v2v import Affine, Maxout
 from thinc.misc import LayerNorm as LN
 from thinc.neural.util import prefer_gpu
-from wasabi import Printer
+from wasabi import msg
 import srsly
 from ..errors import Errors
@@ -122,7 +122,6 @@ def pretrain(
     for key in config:
         if isinstance(config[key], Path):
             config[key] = str(config[key])
-    msg = Printer()
     util.fix_random_seed(seed)
     has_gpu = prefer_gpu()

spacy/cli/profile.py

@@ -9,7 +9,7 @@ import pstats
 import sys
 import itertools
 import thinc.extra.datasets
-from wasabi import Printer
+from wasabi import msg
 from ..util import load_model
@@ -26,7 +26,6 @@ def profile(model, inputs=None, n_texts=10000):
     It can either be provided as a JSONL file, or be read from sys.stdin.
     If no input file is specified, the IMDB dataset is loaded via Thinc.
     """
-    msg = Printer()
     if inputs is not None:
         inputs = _read_inputs(inputs, msg)
     if inputs is None:

spacy/cli/train.py

@@ -8,7 +8,7 @@ from thinc.neural._classes.model import Model
 from timeit import default_timer as timer
 import shutil
 import srsly
-from wasabi import Printer
+from wasabi import msg
 import contextlib
 import random
@@ -89,7 +89,6 @@ def train(
     # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
     import tqdm
-    msg = Printer()
     util.fix_random_seed()
     util.set_env_log(verbose)

spacy/cli/validate.py

@@ -5,7 +5,7 @@ from pathlib import Path
 import sys
 import requests
 import srsly
-from wasabi import Printer
+from wasabi import msg
 from ..compat import path2str
 from ..util import get_data_path
@@ -17,7 +17,6 @@ def validate():
     Validate that the currently installed version of spaCy is compatible
     with the installed models. Should be run after `pip install -U spacy`.
     """
-    msg = Printer()
     with msg.loading("Loading compatibility table..."):
         r = requests.get(about.__compatibility__)
     if r.status_code != 200:

spacy/scorer.py

@@ -82,6 +82,7 @@ class Scorer(object):
         self.sbd = PRFScore()
         self.unlabelled = PRFScore()
         self.labelled = PRFScore()
+        self.labelled_per_dep = dict()
         self.tags = PRFScore()
         self.ner = PRFScore()
         self.ner_per_ents = dict()
@@ -124,9 +125,18 @@ class Scorer(object):
     @property
     def las(self):
-        """RETURNS (float): Labelled depdendency score."""
+        """RETURNS (float): Labelled dependency score."""
         return self.labelled.fscore * 100

+    @property
+    def las_per_type(self):
+        """RETURNS (dict): Scores per dependency label.
+        """
+        return {
+            k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
+            for k, v in self.labelled_per_dep.items()
+        }
+
     @property
     def ents_p(self):
         """RETURNS (float): Named entity accuracy (precision)."""
@@ -196,6 +206,7 @@ class Scorer(object):
         return {
             "uas": self.uas,
             "las": self.las,
+            "las_per_type": self.las_per_type,
             "ents_p": self.ents_p,
             "ents_r": self.ents_r,
             "ents_f": self.ents_f,
@@ -223,13 +234,20 @@
             doc, tuple(zip(*gold.orig_annot)) + (gold.cats,)
         )
         gold_deps = set()
+        gold_deps_per_dep = {}
         gold_tags = set()
         gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
         for id_, word, tag, head, dep, ner in gold.orig_annot:
             gold_tags.add((id_, tag))
             if dep not in (None, "") and dep.lower() not in punct_labels:
                 gold_deps.add((id_, head, dep.lower()))
+                if dep.lower() not in self.labelled_per_dep:
+                    self.labelled_per_dep[dep.lower()] = PRFScore()
+                if dep.lower() not in gold_deps_per_dep:
+                    gold_deps_per_dep[dep.lower()] = set()
+                gold_deps_per_dep[dep.lower()].add((id_, head, dep.lower()))
         cand_deps = set()
+        cand_deps_per_dep = {}
         cand_tags = set()
         for token in doc:
             if token.orth_.isspace():
@@ -249,6 +267,11 @@
                     self.labelled.fp += 1
                 else:
                     cand_deps.add((gold_i, gold_head, token.dep_.lower()))
+                    if token.dep_.lower() not in self.labelled_per_dep:
+                        self.labelled_per_dep[token.dep_.lower()] = PRFScore()
+                    if token.dep_.lower() not in cand_deps_per_dep:
+                        cand_deps_per_dep[token.dep_.lower()] = set()
+                    cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower()))
         if "-" not in [token[-1] for token in gold.orig_annot]:
             # Find all NER labels in gold and doc
             ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
@@ -280,6 +303,8 @@
             self.ner.score_set(cand_ents, gold_ents)
         self.tags.score_set(cand_tags, gold_tags)
         self.labelled.score_set(cand_deps, gold_deps)
+        for dep in self.labelled_per_dep:
+            self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set()))
         self.unlabelled.score_set(
             set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
         )
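
Taken together, the Scorer changes above allocate one PRFScore per dependency label, collect per-label candidate and gold dependency sets during score(), and expose the result as las_per_type alongside the aggregate las. A minimal sketch of reading the new field, assuming a spaCy v2.2-style pipeline (en_core_web_sm and the sentence are placeholders, and building gold from the doc's own predictions trivially yields perfect scores):

    import spacy
    from spacy.gold import GoldParse
    from spacy.scorer import Scorer

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
    # GoldParse takes absolute head indices and per-token dependency labels
    gold = GoldParse(doc, heads=[t.head.i for t in doc], deps=[t.dep_ for t in doc])

    scorer = Scorer()
    scorer.score(doc, gold)
    print(scorer.scores["las_per_type"])
    # e.g. {"nsubj": {"p": 100.0, "r": 100.0, "f": 100.0}, ...}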

spacy/tests/test_scorer.py

@@ -9,6 +9,14 @@ from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from .util import get_doc

+test_las_apple = [
+    [
+        "Apple is looking at buying U.K. startup for $ 1 billion",
+        {"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
+         "deps": ["nsubj", "aux", "ROOT", "prep", "pcomp", "compound", "dobj", "prep", "quantmod", "compound", "pobj"]},
+    ]
+]
+
 test_ner_cardinal = [
     ["100 - 200", {"entities": [[0, 3, "CARDINAL"], [6, 9, "CARDINAL"]]}]
 ]
@@ -21,6 +29,53 @@ test_ner_apple = [
 ]

+def test_las_per_type(en_vocab):
+    # Gold and Doc are identical
+    scorer = Scorer()
+    for input_, annot in test_las_apple:
+        doc = get_doc(
+            en_vocab,
+            words=input_.split(" "),
+            heads=([h - i for i, h in enumerate(annot["heads"])]),
+            deps=annot["deps"],
+        )
+        gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
+        scorer.score(doc, gold)
+    results = scorer.scores
+
+    assert results["uas"] == 100
+    assert results["las"] == 100
+    assert results["las_per_type"]["nsubj"]["p"] == 100
+    assert results["las_per_type"]["nsubj"]["r"] == 100
+    assert results["las_per_type"]["nsubj"]["f"] == 100
+    assert results["las_per_type"]["compound"]["p"] == 100
+    assert results["las_per_type"]["compound"]["r"] == 100
+    assert results["las_per_type"]["compound"]["f"] == 100
+
+    # One dep is incorrect in Doc
+    scorer = Scorer()
+    for input_, annot in test_las_apple:
+        doc = get_doc(
+            en_vocab,
+            words=input_.split(" "),
+            heads=([h - i for i, h in enumerate(annot["heads"])]),
+            deps=annot["deps"],
+        )
+        gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
+        doc[0].dep_ = "compound"
+        scorer.score(doc, gold)
+    results = scorer.scores
+
+    assert results["uas"] == 100
+    assert_almost_equal(results["las"], 90.9090909)
+    assert results["las_per_type"]["nsubj"]["p"] == 0
+    assert results["las_per_type"]["nsubj"]["r"] == 0
+    assert results["las_per_type"]["nsubj"]["f"] == 0
+    assert_almost_equal(results["las_per_type"]["compound"]["p"], 66.6666666)
+    assert results["las_per_type"]["compound"]["r"] == 100
+    assert results["las_per_type"]["compound"]["f"] == 80
+
+
 def test_ner_per_type(en_vocab):
     # Gold and Doc are identical
     scorer = Scorer()
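
One detail worth calling out in the test above: the heads in test_las_apple are absolute token indices, which is the format GoldParse takes, while the get_doc helper (imported from .util) expects per-token relative offsets, hence the h - i conversion in both loops. The arithmetic for this sentence:

    heads = [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7]      # absolute: token 0 attaches to token 2
    relative = [h - i for i, h in enumerate(heads)]  # [2, 1, 0, -1, -1, 1, -2, -3, 2, 1, -3]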

website/meta/universe.json

@@ -1861,6 +1861,30 @@
             "author_links": {
                 "github": "microsoft"
             }
         },
+        {
+            "id": "dframcy",
+            "title": "Dframcy",
+            "slogan": "Dataframe Integration with spaCy NLP",
+            "github": "yash1994/dframcy",
+            "description": "DframCy is a light-weight utility module to integrate Pandas Dataframe to spaCy's linguistic annotation and training tasks.",
+            "pip": "dframcy",
+            "category": ["pipeline", "training"],
+            "tags": ["pandas"],
+            "code_example": [
+                "import spacy",
+                "from dframcy import DframCy",
+                "",
+                "nlp = spacy.load('en_core_web_sm')",
+                "dframcy = DframCy(nlp)",
+                "doc = dframcy.nlp(u'Apple is looking at buying U.K. startup for $1 billion')",
+                "annotation_dataframe = dframcy.to_dataframe(doc)"
+            ],
+            "author": "Yash Patadia",
+            "author_links": {
+                "twitter": "PatadiaYash",
+                "github": "yash1994"
+            }
+        }
     ],