Merge branch 'master' into spacy.io

Ines Montani 2019-11-04 13:56:11 +01:00
commit d7a94edba6
17 changed files with 125 additions and 31 deletions

requirements.txt

@@ -4,7 +4,7 @@ preshed>=3.0.2,<3.1.0
 thinc>=7.3.0,<7.4.0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.3.0,<1.1.0
+wasabi>=0.4.0,<1.1.0
 srsly>=0.1.0,<1.1.0
 # Third party dependencies
 numpy>=1.15.0
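
The wasabi pin bump above (mirrored in setup.cfg below) goes hand in hand with the CLI changes in this commit: newer wasabi versions expose a ready-made module-level Printer as msg, so each command imports it instead of constructing its own, which is presumably why the pin moves from >=0.3.0 to >=0.4.0. A minimal sketch of the pattern, using only the wasabi Printer methods that appear in this diff (the messages themselves are invented examples):

    from wasabi import msg  # shared module-level Printer

    msg.text("1000 training docs")  # plain informational message
    msg.warn("10 training examples also in evaluation data")
    msg.fail("No evaluation docs")  # red cross, printed as an error
    with msg.loading("Loading compatibility table..."):
        pass  # spinner is shown while this block runs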

setup.cfg

@@ -40,17 +40,19 @@ setup_requires =
     murmurhash>=0.28.0,<1.1.0
     thinc>=7.3.0,<7.4.0
 install_requires =
-    setuptools
-    numpy>=1.15.0
+    # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=7.3.0,<7.4.0
     blis>=0.4.0,<0.5.0
+    wasabi>=0.4.0,<1.1.0
+    srsly>=0.1.0,<1.1.0
+    # Third-party dependencies
+    setuptools
+    numpy>=1.15.0
     plac>=0.9.6,<1.2.0
     requests>=2.13.0,<3.0.0
-    wasabi>=0.3.0,<1.1.0
-    srsly>=0.1.0,<1.1.0
     pathlib==1.0.1; python_version < "3.4"
     importlib_metadata>=0.20; python_version < "3.8"

spacy/__main__.py

@@ -7,12 +7,10 @@ from __future__ import print_function
 if __name__ == "__main__":
     import plac
     import sys
-    from wasabi import Printer
+    from wasabi import msg
     from spacy.cli import download, link, info, package, train, pretrain, convert
     from spacy.cli import init_model, profile, evaluate, validate, debug_data
-    msg = Printer()
     commands = {
         "download": download,
         "link": link,

spacy/cli/debug_data.py

@@ -121,6 +121,8 @@ def debug_data(
     msg.text("{} training docs".format(len(train_docs)))
     msg.text("{} evaluation docs".format(len(dev_docs)))
+    if not len(dev_docs):
+        msg.fail("No evaluation docs")
     overlap = len(train_texts.intersection(dev_texts))
     if overlap:
         msg.warn("{} training examples also in evaluation data".format(overlap))

spacy/cli/download.py

@@ -6,16 +6,13 @@ import requests
 import os
 import subprocess
 import sys
-from wasabi import Printer
+from wasabi import msg
 from .link import link
 from ..util import get_package_path
 from .. import about
-msg = Printer()
 @plac.annotations(
     model=("Model to download (shortcut or name)", "positional", None, str),
     direct=("Force direct download of name + version", "flag", "d", bool),

spacy/cli/evaluate.py

@@ -3,7 +3,7 @@ from __future__ import unicode_literals, division, print_function
 import plac
 from timeit import default_timer as timer
-from wasabi import Printer
+from wasabi import msg
 from ..gold import GoldCorpus
 from .. import util
@@ -32,7 +32,6 @@ def evaluate(
     Evaluate a model. To render a sample of parses in a HTML file, set an
     output directory as the displacy_path argument.
     """
-    msg = Printer()
     util.fix_random_seed()
     if gpu_id >= 0:
         util.use_gpu(gpu_id)

spacy/cli/info.py

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import plac
 import platform
 from pathlib import Path
-from wasabi import Printer
+from wasabi import msg
 import srsly
 from ..compat import path2str, basestring_, unicode_
@@ -23,7 +23,6 @@ def info(model=None, markdown=False, silent=False):
     specified as an argument, print model information. Flag --markdown
     prints details in Markdown for easy copy-pasting to GitHub issues.
     """
-    msg = Printer()
     if model:
         if util.is_package(model):
             model_path = util.get_package_path(model)

spacy/cli/init_model.py

@@ -11,7 +11,7 @@ import tarfile
 import gzip
 import zipfile
 import srsly
-from wasabi import Printer
+from wasabi import msg
 from ..vectors import Vectors
 from ..errors import Errors, Warnings, user_warning
@@ -24,7 +24,6 @@ except ImportError:
 DEFAULT_OOV_PROB = -20
-msg = Printer()
 @plac.annotations(

spacy/cli/link.py

@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 import plac
 from pathlib import Path
-from wasabi import Printer
+from wasabi import msg
 from ..compat import symlink_to, path2str
 from .. import util
@@ -20,7 +20,6 @@ def link(origin, link_name, force=False, model_path=None):
     either the name of a pip package, or the local path to the model data
     directory. Linking models allows loading them via spacy.load(link_name).
     """
-    msg = Printer()
     if util.is_package(origin):
         model_path = util.get_package_path(origin)
     else:

spacy/cli/package.py

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import plac
 import shutil
 from pathlib import Path
-from wasabi import Printer, get_raw_input
+from wasabi import msg, get_raw_input
 import srsly
 from ..compat import path2str
@@ -27,7 +27,6 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
     set and a meta.json already exists in the output directory, the existing
     values will be used as the defaults in the command-line prompt.
     """
-    msg = Printer()
     input_path = util.ensure_path(input_dir)
     output_path = util.ensure_path(output_dir)
     meta_path = util.ensure_path(meta_path)

spacy/cli/pretrain.py

@@ -11,7 +11,7 @@ from pathlib import Path
 from thinc.v2v import Affine, Maxout
 from thinc.misc import LayerNorm as LN
 from thinc.neural.util import prefer_gpu
-from wasabi import Printer
+from wasabi import msg
 import srsly
 from ..errors import Errors
@@ -122,7 +122,6 @@ def pretrain(
     for key in config:
         if isinstance(config[key], Path):
             config[key] = str(config[key])
-    msg = Printer()
     util.fix_random_seed(seed)
     has_gpu = prefer_gpu()

spacy/cli/profile.py

@@ -9,7 +9,7 @@ import pstats
 import sys
 import itertools
 import thinc.extra.datasets
-from wasabi import Printer
+from wasabi import msg
 from ..util import load_model
@@ -26,7 +26,6 @@ def profile(model, inputs=None, n_texts=10000):
     It can either be provided as a JSONL file, or be read from sys.stdin.
     If no input file is specified, the IMDB dataset is loaded via Thinc.
     """
-    msg = Printer()
     if inputs is not None:
         inputs = _read_inputs(inputs, msg)
     if inputs is None:

spacy/cli/train.py

@@ -8,7 +8,7 @@ from thinc.neural._classes.model import Model
 from timeit import default_timer as timer
 import shutil
 import srsly
-from wasabi import Printer
+from wasabi import msg
 import contextlib
 import random
@@ -89,7 +89,6 @@ def train(
     # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
     import tqdm
-    msg = Printer()
     util.fix_random_seed()
     util.set_env_log(verbose)

spacy/cli/validate.py

@@ -5,7 +5,7 @@ from pathlib import Path
 import sys
 import requests
 import srsly
-from wasabi import Printer
+from wasabi import msg
 from ..compat import path2str
 from ..util import get_data_path
@@ -17,7 +17,6 @@ def validate():
     Validate that the currently installed version of spaCy is compatible
     with the installed models. Should be run after `pip install -U spacy`.
     """
-    msg = Printer()
     with msg.loading("Loading compatibility table..."):
         r = requests.get(about.__compatibility__)
     if r.status_code != 200:

spacy/scorer.py

@@ -82,6 +82,7 @@ class Scorer(object):
         self.sbd = PRFScore()
         self.unlabelled = PRFScore()
         self.labelled = PRFScore()
+        self.labelled_per_dep = dict()
         self.tags = PRFScore()
         self.ner = PRFScore()
         self.ner_per_ents = dict()
@@ -124,9 +125,18 @@ class Scorer(object):
     @property
     def las(self):
-        """RETURNS (float): Labelled depdendency score."""
+        """RETURNS (float): Labelled dependency score."""
         return self.labelled.fscore * 100

+    @property
+    def las_per_type(self):
+        """RETURNS (dict): Scores per dependency label.
+        """
+        return {
+            k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
+            for k, v in self.labelled_per_dep.items()
+        }
+
     @property
     def ents_p(self):
         """RETURNS (float): Named entity accuracy (precision)."""
@@ -196,6 +206,7 @@ class Scorer(object):
         return {
             "uas": self.uas,
             "las": self.las,
+            "las_per_type": self.las_per_type,
             "ents_p": self.ents_p,
             "ents_r": self.ents_r,
             "ents_f": self.ents_f,
@@ -223,13 +234,20 @@
             doc, tuple(zip(*gold.orig_annot)) + (gold.cats,)
         )
         gold_deps = set()
+        gold_deps_per_dep = {}
         gold_tags = set()
         gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
         for id_, word, tag, head, dep, ner in gold.orig_annot:
             gold_tags.add((id_, tag))
             if dep not in (None, "") and dep.lower() not in punct_labels:
                 gold_deps.add((id_, head, dep.lower()))
+                if dep.lower() not in self.labelled_per_dep:
+                    self.labelled_per_dep[dep.lower()] = PRFScore()
+                if dep.lower() not in gold_deps_per_dep:
+                    gold_deps_per_dep[dep.lower()] = set()
+                gold_deps_per_dep[dep.lower()].add((id_, head, dep.lower()))
         cand_deps = set()
+        cand_deps_per_dep = {}
         cand_tags = set()
         for token in doc:
             if token.orth_.isspace():
@@ -249,6 +267,11 @@
                     self.labelled.fp += 1
                 else:
                     cand_deps.add((gold_i, gold_head, token.dep_.lower()))
+                    if token.dep_.lower() not in self.labelled_per_dep:
+                        self.labelled_per_dep[token.dep_.lower()] = PRFScore()
+                    if token.dep_.lower() not in cand_deps_per_dep:
+                        cand_deps_per_dep[token.dep_.lower()] = set()
+                    cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower()))
         if "-" not in [token[-1] for token in gold.orig_annot]:
             # Find all NER labels in gold and doc
             ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
@@ -280,6 +303,8 @@
             self.ner.score_set(cand_ents, gold_ents)
         self.tags.score_set(cand_tags, gold_tags)
         self.labelled.score_set(cand_deps, gold_deps)
+        for dep in self.labelled_per_dep:
+            self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set()))
         self.unlabelled.score_set(
             set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
         )
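
Taken together, the Scorer changes above allocate one PRFScore per dependency label, collect per-label candidate and gold dependency sets during score(), and expose the result as las_per_type alongside the aggregate las. A minimal sketch of reading the new field, assuming a spaCy v2.2-style pipeline (en_core_web_sm and the sentence are placeholders, and building gold from the doc's own predictions trivially yields perfect scores):

    import spacy
    from spacy.gold import GoldParse
    from spacy.scorer import Scorer

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
    # GoldParse takes absolute head indices and per-token dependency labels
    gold = GoldParse(doc, heads=[t.head.i for t in doc], deps=[t.dep_ for t in doc])

    scorer = Scorer()
    scorer.score(doc, gold)
    print(scorer.scores["las_per_type"])
    # e.g. {"nsubj": {"p": 100.0, "r": 100.0, "f": 100.0}, ...}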

spacy/tests/test_scorer.py

@@ -9,6 +9,14 @@ from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from .util import get_doc

+test_las_apple = [
+    [
+        "Apple is looking at buying U.K. startup for $ 1 billion",
+        {"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
+         "deps": ["nsubj", "aux", "ROOT", "prep", "pcomp", "compound", "dobj", "prep", "quantmod", "compound", "pobj"]},
+    ]
+]
+
 test_ner_cardinal = [
     ["100 - 200", {"entities": [[0, 3, "CARDINAL"], [6, 9, "CARDINAL"]]}]
 ]
@@ -21,6 +29,53 @@ test_ner_apple = [
 ]

+def test_las_per_type(en_vocab):
+    # Gold and Doc are identical
+    scorer = Scorer()
+    for input_, annot in test_las_apple:
+        doc = get_doc(
+            en_vocab,
+            words=input_.split(" "),
+            heads=([h - i for i, h in enumerate(annot["heads"])]),
+            deps=annot["deps"],
+        )
+        gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
+        scorer.score(doc, gold)
+    results = scorer.scores
+
+    assert results["uas"] == 100
+    assert results["las"] == 100
+    assert results["las_per_type"]["nsubj"]["p"] == 100
+    assert results["las_per_type"]["nsubj"]["r"] == 100
+    assert results["las_per_type"]["nsubj"]["f"] == 100
+    assert results["las_per_type"]["compound"]["p"] == 100
+    assert results["las_per_type"]["compound"]["r"] == 100
+    assert results["las_per_type"]["compound"]["f"] == 100
+
+    # One dep is incorrect in Doc
+    scorer = Scorer()
+    for input_, annot in test_las_apple:
+        doc = get_doc(
+            en_vocab,
+            words=input_.split(" "),
+            heads=([h - i for i, h in enumerate(annot["heads"])]),
+            deps=annot["deps"],
+        )
+        gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
+        doc[0].dep_ = "compound"
+        scorer.score(doc, gold)
+    results = scorer.scores
+
+    assert results["uas"] == 100
+    assert_almost_equal(results["las"], 90.9090909)
+    assert results["las_per_type"]["nsubj"]["p"] == 0
+    assert results["las_per_type"]["nsubj"]["r"] == 0
+    assert results["las_per_type"]["nsubj"]["f"] == 0
+    assert_almost_equal(results["las_per_type"]["compound"]["p"], 66.6666666)
+    assert results["las_per_type"]["compound"]["r"] == 100
+    assert results["las_per_type"]["compound"]["f"] == 80
+
+
 def test_ner_per_type(en_vocab):
     # Gold and Doc are identical
     scorer = Scorer()
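
One detail worth calling out in the test above: the heads in test_las_apple are absolute token indices, which is the format GoldParse takes, while the get_doc helper (imported from .util) expects per-token relative offsets, hence the h - i conversion in both loops. The arithmetic for this sentence:

    heads = [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7]      # absolute: token 0 attaches to token 2
    relative = [h - i for i, h in enumerate(heads)]  # [2, 1, 0, -1, -1, 1, -2, -3, 2, 1, -3]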

website/meta/universe.json

@@ -1861,6 +1861,30 @@
             "author_links": {
                 "github": "microsoft"
             }
         },
+        {
+            "id": "dframcy",
+            "title": "Dframcy",
+            "slogan": "Dataframe Integration with spaCy NLP",
+            "github": "yash1994/dframcy",
+            "description": "DframCy is a light-weight utility module to integrate Pandas Dataframe to spaCy's linguistic annotation and training tasks.",
+            "pip": "dframcy",
+            "category": ["pipeline", "training"],
+            "tags": ["pandas"],
+            "code_example": [
+                "import spacy",
+                "from dframcy import DframCy",
+                "",
+                "nlp = spacy.load('en_core_web_sm')",
+                "dframcy = DframCy(nlp)",
+                "doc = dframcy.nlp(u'Apple is looking at buying U.K. startup for $1 billion')",
+                "annotation_dataframe = dframcy.to_dataframe(doc)"
+            ],
+            "author": "Yash Patadia",
+            "author_links": {
+                "twitter": "PatadiaYash",
+                "github": "yash1994"
+            }
+        }
     ],