mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-03 22:06:37 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
d7a94edba6
|
@ -4,7 +4,7 @@ preshed>=3.0.2,<3.1.0
|
|||
thinc>=7.3.0,<7.4.0
|
||||
blis>=0.4.0,<0.5.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.3.0,<1.1.0
|
||||
wasabi>=0.4.0,<1.1.0
|
||||
srsly>=0.1.0,<1.1.0
|
||||
# Third party dependencies
|
||||
numpy>=1.15.0
|
||||
|
|
10
setup.cfg
10
setup.cfg
|
@ -40,17 +40,19 @@ setup_requires =
|
|||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=7.3.0,<7.4.0
|
||||
install_requires =
|
||||
setuptools
|
||||
numpy>=1.15.0
|
||||
# Our libraries
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=7.3.0,<7.4.0
|
||||
blis>=0.4.0,<0.5.0
|
||||
wasabi>=0.4.0,<1.1.0
|
||||
srsly>=0.1.0,<1.1.0
|
||||
# Third-party dependencies
|
||||
setuptools
|
||||
numpy>=1.15.0
|
||||
plac>=0.9.6,<1.2.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
wasabi>=0.3.0,<1.1.0
|
||||
srsly>=0.1.0,<1.1.0
|
||||
pathlib==1.0.1; python_version < "3.4"
|
||||
importlib_metadata>=0.20; python_version < "3.8"
|
||||
|
||||
|
|
|
@ -7,12 +7,10 @@ from __future__ import print_function
|
|||
if __name__ == "__main__":
|
||||
import plac
|
||||
import sys
|
||||
from wasabi import Printer
|
||||
from wasabi import msg
|
||||
from spacy.cli import download, link, info, package, train, pretrain, convert
|
||||
from spacy.cli import init_model, profile, evaluate, validate, debug_data
|
||||
|
||||
msg = Printer()
|
||||
|
||||
commands = {
|
||||
"download": download,
|
||||
"link": link,
|
||||
|
|
|
@ -121,6 +121,8 @@ def debug_data(
|
|||
msg.text("{} training docs".format(len(train_docs)))
|
||||
msg.text("{} evaluation docs".format(len(dev_docs)))
|
||||
|
||||
if not len(dev_docs):
|
||||
msg.fail("No evaluation docs")
|
||||
overlap = len(train_texts.intersection(dev_texts))
|
||||
if overlap:
|
||||
msg.warn("{} training examples also in evaluation data".format(overlap))
|
||||
|
|
|
@ -6,16 +6,13 @@ import requests
|
|||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from wasabi import Printer
|
||||
from wasabi import msg
|
||||
|
||||
from .link import link
|
||||
from ..util import get_package_path
|
||||
from .. import about
|
||||
|
||||
|
||||
msg = Printer()
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Model to download (shortcut or name)", "positional", None, str),
|
||||
direct=("Force direct download of name + version", "flag", "d", bool),
|
||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals, division, print_function
|
|||
|
||||
import plac
|
||||
from timeit import default_timer as timer
|
||||
from wasabi import Printer
|
||||
from wasabi import msg
|
||||
|
||||
from ..gold import GoldCorpus
|
||||
from .. import util
|
||||
|
@ -32,7 +32,6 @@ def evaluate(
|
|||
Evaluate a model. To render a sample of parses in a HTML file, set an
|
||||
output directory as the displacy_path argument.
|
||||
"""
|
||||
msg = Printer()
|
||||
util.fix_random_seed()
|
||||
if gpu_id >= 0:
|
||||
util.use_gpu(gpu_id)
|
||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
|||
import plac
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
from wasabi import msg
|
||||
import srsly
|
||||
|
||||
from ..compat import path2str, basestring_, unicode_
|
||||
|
@ -23,7 +23,6 @@ def info(model=None, markdown=False, silent=False):
|
|||
specified as an argument, print model information. Flag --markdown
|
||||
prints details in Markdown for easy copy-pasting to GitHub issues.
|
||||
"""
|
||||
msg = Printer()
|
||||
if model:
|
||||
if util.is_package(model):
|
||||
model_path = util.get_package_path(model)
|
||||
|
|
|
@ -11,7 +11,7 @@ import tarfile
|
|||
import gzip
|
||||
import zipfile
|
||||
import srsly
|
||||
from wasabi import Printer
|
||||
from wasabi import msg
|
||||
|
||||
from ..vectors import Vectors
|
||||
from ..errors import Errors, Warnings, user_warning
|
||||
|
@ -24,7 +24,6 @@ except ImportError:
|
|||
|
||||
|
||||
DEFAULT_OOV_PROB = -20
|
||||
msg = Printer()
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
|||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
from wasabi import msg
|
||||
|
||||
from ..compat import symlink_to, path2str
|
||||
from .. import util
|
||||
|
@ -20,7 +20,6 @@ def link(origin, link_name, force=False, model_path=None):
|
|||
either the name of a pip package, or the local path to the model data
|
||||
directory. Linking models allows loading them via spacy.load(link_name).
|
||||
"""
|
||||
msg = Printer()
|
||||
if util.is_package(origin):
|
||||
model_path = util.get_package_path(origin)
|
||||
else:
|
||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
|||
import plac
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from wasabi import Printer, get_raw_input
|
||||
from wasabi import msg, get_raw_input
|
||||
import srsly
|
||||
|
||||
from ..compat import path2str
|
||||
|
@ -27,7 +27,6 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
|
|||
set and a meta.json already exists in the output directory, the existing
|
||||
values will be used as the defaults in the command-line prompt.
|
||||
"""
|
||||
msg = Printer()
|
||||
input_path = util.ensure_path(input_dir)
|
||||
output_path = util.ensure_path(output_dir)
|
||||
meta_path = util.ensure_path(meta_path)
|
||||
|
|
|
@ -11,7 +11,7 @@ from pathlib import Path
|
|||
from thinc.v2v import Affine, Maxout
|
||||
from thinc.misc import LayerNorm as LN
|
||||
from thinc.neural.util import prefer_gpu
|
||||
from wasabi import Printer
|
||||
from wasabi import msg
|
||||
import srsly
|
||||
|
||||
from ..errors import Errors
|
||||
|
@ -122,7 +122,6 @@ def pretrain(
|
|||
for key in config:
|
||||
if isinstance(config[key], Path):
|
||||
config[key] = str(config[key])
|
||||
msg = Printer()
|
||||
util.fix_random_seed(seed)
|
||||
|
||||
has_gpu = prefer_gpu()
|
||||
|
|
|
@ -9,7 +9,7 @@ import pstats
|
|||
import sys
|
||||
import itertools
|
||||
import thinc.extra.datasets
|
||||
from wasabi import Printer
|
||||
from wasabi import msg
|
||||
|
||||
from ..util import load_model
|
||||
|
||||
|
@ -26,7 +26,6 @@ def profile(model, inputs=None, n_texts=10000):
|
|||
It can either be provided as a JSONL file, or be read from sys.stdin.
|
||||
If no input file is specified, the IMDB dataset is loaded via Thinc.
|
||||
"""
|
||||
msg = Printer()
|
||||
if inputs is not None:
|
||||
inputs = _read_inputs(inputs, msg)
|
||||
if inputs is None:
|
||||
|
|
|
@ -8,7 +8,7 @@ from thinc.neural._classes.model import Model
|
|||
from timeit import default_timer as timer
|
||||
import shutil
|
||||
import srsly
|
||||
from wasabi import Printer
|
||||
from wasabi import msg
|
||||
import contextlib
|
||||
import random
|
||||
|
||||
|
@ -89,7 +89,6 @@ def train(
|
|||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||
import tqdm
|
||||
|
||||
msg = Printer()
|
||||
util.fix_random_seed()
|
||||
util.set_env_log(verbose)
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ from pathlib import Path
|
|||
import sys
|
||||
import requests
|
||||
import srsly
|
||||
from wasabi import Printer
|
||||
from wasabi import msg
|
||||
|
||||
from ..compat import path2str
|
||||
from ..util import get_data_path
|
||||
|
@ -17,7 +17,6 @@ def validate():
|
|||
Validate that the currently installed version of spaCy is compatible
|
||||
with the installed models. Should be run after `pip install -U spacy`.
|
||||
"""
|
||||
msg = Printer()
|
||||
with msg.loading("Loading compatibility table..."):
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
|
|
|
@ -82,6 +82,7 @@ class Scorer(object):
|
|||
self.sbd = PRFScore()
|
||||
self.unlabelled = PRFScore()
|
||||
self.labelled = PRFScore()
|
||||
self.labelled_per_dep = dict()
|
||||
self.tags = PRFScore()
|
||||
self.ner = PRFScore()
|
||||
self.ner_per_ents = dict()
|
||||
|
@ -124,9 +125,18 @@ class Scorer(object):
|
|||
|
||||
@property
|
||||
def las(self):
|
||||
"""RETURNS (float): Labelled depdendency score."""
|
||||
"""RETURNS (float): Labelled dependency score."""
|
||||
return self.labelled.fscore * 100
|
||||
|
||||
@property
|
||||
def las_per_type(self):
|
||||
"""RETURNS (dict): Scores per dependency label.
|
||||
"""
|
||||
return {
|
||||
k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
|
||||
for k, v in self.labelled_per_dep.items()
|
||||
}
|
||||
|
||||
@property
|
||||
def ents_p(self):
|
||||
"""RETURNS (float): Named entity accuracy (precision)."""
|
||||
|
@ -196,6 +206,7 @@ class Scorer(object):
|
|||
return {
|
||||
"uas": self.uas,
|
||||
"las": self.las,
|
||||
"las_per_type": self.las_per_type,
|
||||
"ents_p": self.ents_p,
|
||||
"ents_r": self.ents_r,
|
||||
"ents_f": self.ents_f,
|
||||
|
@ -223,13 +234,20 @@ class Scorer(object):
|
|||
doc, tuple(zip(*gold.orig_annot)) + (gold.cats,)
|
||||
)
|
||||
gold_deps = set()
|
||||
gold_deps_per_dep = {}
|
||||
gold_tags = set()
|
||||
gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
|
||||
for id_, word, tag, head, dep, ner in gold.orig_annot:
|
||||
gold_tags.add((id_, tag))
|
||||
if dep not in (None, "") and dep.lower() not in punct_labels:
|
||||
gold_deps.add((id_, head, dep.lower()))
|
||||
if dep.lower() not in self.labelled_per_dep:
|
||||
self.labelled_per_dep[dep.lower()] = PRFScore()
|
||||
if dep.lower() not in gold_deps_per_dep:
|
||||
gold_deps_per_dep[dep.lower()] = set()
|
||||
gold_deps_per_dep[dep.lower()].add((id_, head, dep.lower()))
|
||||
cand_deps = set()
|
||||
cand_deps_per_dep = {}
|
||||
cand_tags = set()
|
||||
for token in doc:
|
||||
if token.orth_.isspace():
|
||||
|
@ -249,6 +267,11 @@ class Scorer(object):
|
|||
self.labelled.fp += 1
|
||||
else:
|
||||
cand_deps.add((gold_i, gold_head, token.dep_.lower()))
|
||||
if token.dep_.lower() not in self.labelled_per_dep:
|
||||
self.labelled_per_dep[token.dep_.lower()] = PRFScore()
|
||||
if token.dep_.lower() not in cand_deps_per_dep:
|
||||
cand_deps_per_dep[token.dep_.lower()] = set()
|
||||
cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower()))
|
||||
if "-" not in [token[-1] for token in gold.orig_annot]:
|
||||
# Find all NER labels in gold and doc
|
||||
ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
|
||||
|
@ -280,6 +303,8 @@ class Scorer(object):
|
|||
self.ner.score_set(cand_ents, gold_ents)
|
||||
self.tags.score_set(cand_tags, gold_tags)
|
||||
self.labelled.score_set(cand_deps, gold_deps)
|
||||
for dep in self.labelled_per_dep:
|
||||
self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set()))
|
||||
self.unlabelled.score_set(
|
||||
set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
|
||||
)
|
||||
|
|
|
@ -9,6 +9,14 @@ from spacy.scorer import Scorer, ROCAUCScore
|
|||
from spacy.scorer import _roc_auc_score, _roc_curve
|
||||
from .util import get_doc
|
||||
|
||||
test_las_apple = [
|
||||
[
|
||||
"Apple is looking at buying U.K. startup for $ 1 billion",
|
||||
{"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
|
||||
"deps": ['nsubj', 'aux', 'ROOT', 'prep', 'pcomp', 'compound', 'dobj', 'prep', 'quantmod', 'compound', 'pobj']},
|
||||
]
|
||||
]
|
||||
|
||||
test_ner_cardinal = [
|
||||
["100 - 200", {"entities": [[0, 3, "CARDINAL"], [6, 9, "CARDINAL"]]}]
|
||||
]
|
||||
|
@ -21,6 +29,53 @@ test_ner_apple = [
|
|||
]
|
||||
|
||||
|
||||
def test_las_per_type(en_vocab):
|
||||
# Gold and Doc are identical
|
||||
scorer = Scorer()
|
||||
for input_, annot in test_las_apple:
|
||||
doc = get_doc(
|
||||
en_vocab,
|
||||
words=input_.split(" "),
|
||||
heads=([h - i for i, h in enumerate(annot["heads"])]),
|
||||
deps=annot["deps"],
|
||||
)
|
||||
gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
|
||||
scorer.score(doc, gold)
|
||||
results = scorer.scores
|
||||
|
||||
assert results["uas"] == 100
|
||||
assert results["las"] == 100
|
||||
assert results["las_per_type"]["nsubj"]["p"] == 100
|
||||
assert results["las_per_type"]["nsubj"]["r"] == 100
|
||||
assert results["las_per_type"]["nsubj"]["f"] == 100
|
||||
assert results["las_per_type"]["compound"]["p"] == 100
|
||||
assert results["las_per_type"]["compound"]["r"] == 100
|
||||
assert results["las_per_type"]["compound"]["f"] == 100
|
||||
|
||||
# One dep is incorrect in Doc
|
||||
scorer = Scorer()
|
||||
for input_, annot in test_las_apple:
|
||||
doc = get_doc(
|
||||
en_vocab,
|
||||
words=input_.split(" "),
|
||||
heads=([h - i for i, h in enumerate(annot["heads"])]),
|
||||
deps=annot["deps"]
|
||||
)
|
||||
gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
|
||||
doc[0].dep_ = "compound"
|
||||
scorer.score(doc, gold)
|
||||
results = scorer.scores
|
||||
|
||||
assert results["uas"] == 100
|
||||
assert_almost_equal(results["las"], 90.9090909)
|
||||
assert results["las_per_type"]["nsubj"]["p"] == 0
|
||||
assert results["las_per_type"]["nsubj"]["r"] == 0
|
||||
assert results["las_per_type"]["nsubj"]["f"] == 0
|
||||
assert_almost_equal(results["las_per_type"]["compound"]["p"], 66.6666666)
|
||||
assert results["las_per_type"]["compound"]["r"] == 100
|
||||
assert results["las_per_type"]["compound"]["f"] == 80
|
||||
|
||||
|
||||
def test_ner_per_type(en_vocab):
|
||||
# Gold and Doc are identical
|
||||
scorer = Scorer()
|
||||
|
|
|
@ -1861,6 +1861,30 @@
|
|||
"author_links": {
|
||||
"github": "microsoft"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "dframcy",
|
||||
"title": "Dframcy",
|
||||
"slogan": "Dataframe Integration with spaCy NLP",
|
||||
"github": "yash1994/dframcy",
|
||||
"description": "DframCy is a light-weight utility module to integrate Pandas Dataframe to spaCy's linguistic annotation and training tasks.",
|
||||
"pip": "dframcy",
|
||||
"category": ["pipeline", "training"],
|
||||
"tags": ["pandas"],
|
||||
"code_example": [
|
||||
"import spacy",
|
||||
"from dframcy import DframCy",
|
||||
"",
|
||||
"nlp = spacy.load('en_core_web_sm')",
|
||||
"dframcy = DframCy(nlp)",
|
||||
"doc = dframcy.nlp(u'Apple is looking at buying U.K. startup for $1 billion')",
|
||||
"annotation_dataframe = dframcy.to_dataframe(doc)"
|
||||
],
|
||||
"author": "Yash Patadia",
|
||||
"author_links": {
|
||||
"twitter": "PatadiaYash",
|
||||
"github": "yash1994"
|
||||
}
|
||||
}
|
||||
],
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user