diff --git a/requirements.txt b/requirements.txt
index ad7059f3a..89118b970 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ preshed>=3.0.2,<3.1.0
 thinc>=7.3.0,<7.4.0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.3.0,<1.1.0
+wasabi>=0.4.0,<1.1.0
 srsly>=0.1.0,<1.1.0
 # Third party dependencies
 numpy>=1.15.0
diff --git a/setup.cfg b/setup.cfg
index 51e722354..60a24dc58 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -40,17 +40,19 @@ setup_requires =
     murmurhash>=0.28.0,<1.1.0
     thinc>=7.3.0,<7.4.0
 install_requires =
-    setuptools
-    numpy>=1.15.0
+    # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=7.3.0,<7.4.0
     blis>=0.4.0,<0.5.0
+    wasabi>=0.4.0,<1.1.0
+    srsly>=0.1.0,<1.1.0
+    # Third-party dependencies
+    setuptools
+    numpy>=1.15.0
     plac>=0.9.6,<1.2.0
     requests>=2.13.0,<3.0.0
-    wasabi>=0.3.0,<1.1.0
-    srsly>=0.1.0,<1.1.0
     pathlib==1.0.1; python_version < "3.4"
     importlib_metadata>=0.20; python_version < "3.8"
diff --git a/spacy/__main__.py b/spacy/__main__.py
index 716561566..2c285095e 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -7,12 +7,10 @@ from __future__ import print_function
 if __name__ == "__main__":
     import plac
     import sys
-    from wasabi import Printer
+    from wasabi import msg
     from spacy.cli import download, link, info, package, train, pretrain, convert
     from spacy.cli import init_model, profile, evaluate, validate, debug_data
 
-    msg = Printer()
-
     commands = {
         "download": download,
         "link": link,
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 8161ddf45..5d044e617 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -121,6 +121,8 @@ def debug_data(
     msg.text("{} training docs".format(len(train_docs)))
     msg.text("{} evaluation docs".format(len(dev_docs)))
+    if not len(dev_docs):
+        msg.fail("No evaluation docs")
     overlap = len(train_texts.intersection(dev_texts))
     if overlap:
         msg.warn("{} training examples also in evaluation data".format(overlap))
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index c57e2364b..19f3e7860 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -6,16 +6,13 @@ import requests
 import os
 import subprocess
 import sys
-from wasabi import Printer
+from wasabi import msg
 
 from .link import link
 from ..util import get_package_path
 from .. import about
 
-msg = Printer()
-
-
 @plac.annotations(
     model=("Model to download (shortcut or name)", "positional", None, str),
     direct=("Force direct download of name + version", "flag", "d", bool),
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 1114ada08..c24e37038 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals, division, print_function
 
 import plac
 from timeit import default_timer as timer
-from wasabi import Printer
+from wasabi import msg
 
 from ..gold import GoldCorpus
 from .. import util
@@ -32,7 +32,6 @@ def evaluate(
     Evaluate a model. To render a sample of parses in a HTML file, set an
     output directory as the displacy_path argument.
""" - msg = Printer() util.fix_random_seed() if gpu_id >= 0: util.use_gpu(gpu_id) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 3655327ef..080d0dc77 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import plac import platform from pathlib import Path -from wasabi import Printer +from wasabi import msg import srsly from ..compat import path2str, basestring_, unicode_ @@ -23,7 +23,6 @@ def info(model=None, markdown=False, silent=False): speficied as an argument, print model information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. """ - msg = Printer() if model: if util.is_package(model): model_path = util.get_package_path(model) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index c285a12a6..cda21cbcc 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -11,7 +11,7 @@ import tarfile import gzip import zipfile import srsly -from wasabi import Printer +from wasabi import msg from ..vectors import Vectors from ..errors import Errors, Warnings, user_warning @@ -24,7 +24,6 @@ except ImportError: DEFAULT_OOV_PROB = -20 -msg = Printer() @plac.annotations( diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 6b719ffe6..8117829b5 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import plac from pathlib import Path -from wasabi import Printer +from wasabi import msg from ..compat import symlink_to, path2str from .. import util @@ -20,7 +20,6 @@ def link(origin, link_name, force=False, model_path=None): either the name of a pip package, or the local path to the model data directory. Linking models allows loading them via spacy.load(link_name). """ - msg = Printer() if util.is_package(origin): model_path = util.get_package_path(origin) else: diff --git a/spacy/cli/package.py b/spacy/cli/package.py index e99a6d5ff..8ed92259c 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import plac import shutil from pathlib import Path -from wasabi import Printer, get_raw_input +from wasabi import msg, get_raw_input import srsly from ..compat import path2str @@ -27,7 +27,6 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals set and a meta.json already exists in the output directory, the existing values will be used as the defaults in the command-line prompt. 
""" - msg = Printer() input_path = util.ensure_path(input_dir) output_path = util.ensure_path(output_dir) meta_path = util.ensure_path(meta_path) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index f7236f7de..c1aade2b2 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -11,7 +11,7 @@ from pathlib import Path from thinc.v2v import Affine, Maxout from thinc.misc import LayerNorm as LN from thinc.neural.util import prefer_gpu -from wasabi import Printer +from wasabi import msg import srsly from ..errors import Errors @@ -122,7 +122,6 @@ def pretrain( for key in config: if isinstance(config[key], Path): config[key] = str(config[key]) - msg = Printer() util.fix_random_seed(seed) has_gpu = prefer_gpu() diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 201ab13d5..4995224f3 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -9,7 +9,7 @@ import pstats import sys import itertools import thinc.extra.datasets -from wasabi import Printer +from wasabi import msg from ..util import load_model @@ -26,7 +26,6 @@ def profile(model, inputs=None, n_texts=10000): It can either be provided as a JSONL file, or be read from sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. """ - msg = Printer() if inputs is not None: inputs = _read_inputs(inputs, msg) if inputs is None: diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 13fcae37f..d1fbdd179 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -8,7 +8,7 @@ from thinc.neural._classes.model import Model from timeit import default_timer as timer import shutil import srsly -from wasabi import Printer +from wasabi import msg import contextlib import random @@ -89,7 +89,6 @@ def train( # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 import tqdm - msg = Printer() util.fix_random_seed() util.set_env_log(verbose) diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 38f8d2313..93abad6f6 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -5,7 +5,7 @@ from pathlib import Path import sys import requests import srsly -from wasabi import Printer +from wasabi import msg from ..compat import path2str from ..util import get_data_path @@ -17,7 +17,6 @@ def validate(): Validate that the currently installed version of spaCy is compatible with the installed models. Should be run after `pip install -U spacy`. """ - msg = Printer() with msg.loading("Loading compatibility table..."): r = requests.get(about.__compatibility__) if r.status_code != 200: diff --git a/spacy/scorer.py b/spacy/scorer.py index e8dfe8b04..0b4843f41 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -82,6 +82,7 @@ class Scorer(object): self.sbd = PRFScore() self.unlabelled = PRFScore() self.labelled = PRFScore() + self.labelled_per_dep = dict() self.tags = PRFScore() self.ner = PRFScore() self.ner_per_ents = dict() @@ -124,9 +125,18 @@ class Scorer(object): @property def las(self): - """RETURNS (float): Labelled depdendency score.""" + """RETURNS (float): Labelled dependency score.""" return self.labelled.fscore * 100 + @property + def las_per_type(self): + """RETURNS (dict): Scores per dependency label. 
+ """ + return { + k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} + for k, v in self.labelled_per_dep.items() + } + @property def ents_p(self): """RETURNS (float): Named entity accuracy (precision).""" @@ -196,6 +206,7 @@ class Scorer(object): return { "uas": self.uas, "las": self.las, + "las_per_type": self.las_per_type, "ents_p": self.ents_p, "ents_r": self.ents_r, "ents_f": self.ents_f, @@ -223,13 +234,20 @@ class Scorer(object): doc, tuple(zip(*gold.orig_annot)) + (gold.cats,) ) gold_deps = set() + gold_deps_per_dep = {} gold_tags = set() gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot])) for id_, word, tag, head, dep, ner in gold.orig_annot: gold_tags.add((id_, tag)) if dep not in (None, "") and dep.lower() not in punct_labels: gold_deps.add((id_, head, dep.lower())) + if dep.lower() not in self.labelled_per_dep: + self.labelled_per_dep[dep.lower()] = PRFScore() + if dep.lower() not in gold_deps_per_dep: + gold_deps_per_dep[dep.lower()] = set() + gold_deps_per_dep[dep.lower()].add((id_, head, dep.lower())) cand_deps = set() + cand_deps_per_dep = {} cand_tags = set() for token in doc: if token.orth_.isspace(): @@ -249,6 +267,11 @@ class Scorer(object): self.labelled.fp += 1 else: cand_deps.add((gold_i, gold_head, token.dep_.lower())) + if token.dep_.lower() not in self.labelled_per_dep: + self.labelled_per_dep[token.dep_.lower()] = PRFScore() + if token.dep_.lower() not in cand_deps_per_dep: + cand_deps_per_dep[token.dep_.lower()] = set() + cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower())) if "-" not in [token[-1] for token in gold.orig_annot]: # Find all NER labels in gold and doc ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents]) @@ -280,6 +303,8 @@ class Scorer(object): self.ner.score_set(cand_ents, gold_ents) self.tags.score_set(cand_tags, gold_tags) self.labelled.score_set(cand_deps, gold_deps) + for dep in self.labelled_per_dep: + self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())) self.unlabelled.score_set( set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps) ) diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 9cc4f75b2..c59358a6b 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -9,6 +9,14 @@ from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve from .util import get_doc +test_las_apple = [ + [ + "Apple is looking at buying U.K. 
startup for $ 1 billion", + {"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7], + "deps": ['nsubj', 'aux', 'ROOT', 'prep', 'pcomp', 'compound', 'dobj', 'prep', 'quantmod', 'compound', 'pobj']}, + ] +] + test_ner_cardinal = [ ["100 - 200", {"entities": [[0, 3, "CARDINAL"], [6, 9, "CARDINAL"]]}] ] @@ -21,6 +29,53 @@ test_ner_apple = [ ] +def test_las_per_type(en_vocab): + # Gold and Doc are identical + scorer = Scorer() + for input_, annot in test_las_apple: + doc = get_doc( + en_vocab, + words=input_.split(" "), + heads=([h - i for i, h in enumerate(annot["heads"])]), + deps=annot["deps"], + ) + gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"]) + scorer.score(doc, gold) + results = scorer.scores + + assert results["uas"] == 100 + assert results["las"] == 100 + assert results["las_per_type"]["nsubj"]["p"] == 100 + assert results["las_per_type"]["nsubj"]["r"] == 100 + assert results["las_per_type"]["nsubj"]["f"] == 100 + assert results["las_per_type"]["compound"]["p"] == 100 + assert results["las_per_type"]["compound"]["r"] == 100 + assert results["las_per_type"]["compound"]["f"] == 100 + + # One dep is incorrect in Doc + scorer = Scorer() + for input_, annot in test_las_apple: + doc = get_doc( + en_vocab, + words=input_.split(" "), + heads=([h - i for i, h in enumerate(annot["heads"])]), + deps=annot["deps"] + ) + gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"]) + doc[0].dep_ = "compound" + scorer.score(doc, gold) + results = scorer.scores + + assert results["uas"] == 100 + assert_almost_equal(results["las"], 90.9090909) + assert results["las_per_type"]["nsubj"]["p"] == 0 + assert results["las_per_type"]["nsubj"]["r"] == 0 + assert results["las_per_type"]["nsubj"]["f"] == 0 + assert_almost_equal(results["las_per_type"]["compound"]["p"], 66.6666666) + assert results["las_per_type"]["compound"]["r"] == 100 + assert results["las_per_type"]["compound"]["f"] == 80 + + def test_ner_per_type(en_vocab): # Gold and Doc are identical scorer = Scorer() diff --git a/website/meta/universe.json b/website/meta/universe.json index 749abc659..40ebfaaa7 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1861,6 +1861,30 @@ "author_links": { "github": "microsoft" } + }, + { + "id": "dframcy", + "title": "Dframcy", + "slogan": "Dataframe Integration with spaCy NLP", + "github": "yash1994/dframcy", + "description": "DframCy is a light-weight utility module to integrate Pandas Dataframe to spaCy's linguistic annotation and training tasks.", + "pip": "dframcy", + "category": ["pipeline", "training"], + "tags": ["pandas"], + "code_example": [ + "import spacy", + "from dframcy import DframCy", + "", + "nlp = spacy.load('en_core_web_sm')", + "dframcy = DframCy(nlp)", + "doc = dframcy.nlp(u'Apple is looking at buying U.K. startup for $1 billion')", + "annotation_dataframe = dframcy.to_dataframe(doc)" + ], + "author": "Yash Patadia", + "author_links": { + "twitter": "PatadiaYash", + "github": "yash1994" + } } ],
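
Usage note: the CLI changes above drop the per-module Printer() instances in favour of the shared msg object that wasabi exposes from version 0.4.0, which is also why the wasabi pin in requirements.txt and setup.cfg moves from >=0.3.0 to >=0.4.0. A minimal sketch of that singleton, assuming wasabi>=0.4.0 is installed:

# Minimal sketch of the shared wasabi printer the CLI modules now import.
# The method names below are the ones used in the diff (msg.text, msg.warn,
# msg.fail, msg.loading) plus msg.good for completeness.
from wasabi import msg

msg.text("10 training docs")
msg.warn("2 training examples also in evaluation data")
msg.good("Everything looks fine")
with msg.loading("Loading compatibility table..."):
    pass  # long-running work goes here
msg.fail("No evaluation docs")  # pass exits=1 to also terminate the process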
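
The scorer changes add a labelled-attachment breakdown per dependency label, exposed as Scorer.las_per_type and included in Scorer.scores under "las_per_type". A minimal sketch of reading it outside the test suite, assuming these changes plus the en_core_web_sm model are installed (the model name and the self-scoring setup are illustrative only):

# Sketch: score a parsed Doc against a GoldParse and read the new per-label
# LAS breakdown. Treating the model's own analysis as gold keeps the example
# self-contained; real evaluation data would supply the heads and deps.
import spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
gold = GoldParse(
    doc,
    heads=[token.head.i for token in doc],
    deps=[token.dep_ for token in doc],
)

scorer = Scorer()
scorer.score(doc, gold)
print(scorer.scores["las"])           # overall labelled attachment score
print(scorer.scores["las_per_type"])  # e.g. {"nsubj": {"p": ..., "r": ..., "f": ...}, ...}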
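
For reference, the expected numbers in the second half of test_las_per_type follow from simple set counting once token 0 is relabelled as "compound": the gold nsubj arc is never predicted (precision, recall and F all 0), while the candidate compound arcs are tokens 0, 5 and 9, of which only 5 and 9 match gold. A quick arithmetic check, not part of the diff:

# "compound": 2 true positives (tokens 5, 9), 1 false positive (token 0),
# 0 false negatives; overall, 10 of 11 labelled arcs remain correct.
tp, fp, fn = 2, 1, 0
precision = tp / (tp + fp) * 100                        # 66.66...
recall = tp / (tp + fn) * 100                           # 100.0
fscore = 2 * precision * recall / (precision + recall)  # 80.0
las = 10 / 11 * 100                                     # 90.909...
print(precision, recall, fscore, las)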