mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Merge branch 'master' into spacy.io
This commit is contained in:
		
						commit
						d7a94edba6
					
				| 
						 | 
					@ -4,7 +4,7 @@ preshed>=3.0.2,<3.1.0
 | 
				
			||||||
thinc>=7.3.0,<7.4.0
 | 
					thinc>=7.3.0,<7.4.0
 | 
				
			||||||
blis>=0.4.0,<0.5.0
 | 
					blis>=0.4.0,<0.5.0
 | 
				
			||||||
murmurhash>=0.28.0,<1.1.0
 | 
					murmurhash>=0.28.0,<1.1.0
 | 
				
			||||||
wasabi>=0.3.0,<1.1.0
 | 
					wasabi>=0.4.0,<1.1.0
 | 
				
			||||||
srsly>=0.1.0,<1.1.0
 | 
					srsly>=0.1.0,<1.1.0
 | 
				
			||||||
# Third party dependencies
 | 
					# Third party dependencies
 | 
				
			||||||
numpy>=1.15.0
 | 
					numpy>=1.15.0
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										10
									
								
								setup.cfg
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								setup.cfg
									
									
									
									
									
								
							| 
						 | 
					@ -40,17 +40,19 @@ setup_requires =
 | 
				
			||||||
    murmurhash>=0.28.0,<1.1.0
 | 
					    murmurhash>=0.28.0,<1.1.0
 | 
				
			||||||
    thinc>=7.3.0,<7.4.0
 | 
					    thinc>=7.3.0,<7.4.0
 | 
				
			||||||
install_requires =
 | 
					install_requires =
 | 
				
			||||||
    setuptools
 | 
					    # Our libraries
 | 
				
			||||||
    numpy>=1.15.0
 | 
					 | 
				
			||||||
    murmurhash>=0.28.0,<1.1.0
 | 
					    murmurhash>=0.28.0,<1.1.0
 | 
				
			||||||
    cymem>=2.0.2,<2.1.0
 | 
					    cymem>=2.0.2,<2.1.0
 | 
				
			||||||
    preshed>=3.0.2,<3.1.0
 | 
					    preshed>=3.0.2,<3.1.0
 | 
				
			||||||
    thinc>=7.3.0,<7.4.0
 | 
					    thinc>=7.3.0,<7.4.0
 | 
				
			||||||
    blis>=0.4.0,<0.5.0
 | 
					    blis>=0.4.0,<0.5.0
 | 
				
			||||||
 | 
					    wasabi>=0.4.0,<1.1.0
 | 
				
			||||||
 | 
					    srsly>=0.1.0,<1.1.0
 | 
				
			||||||
 | 
					    # Third-party dependencies
 | 
				
			||||||
 | 
					    setuptools
 | 
				
			||||||
 | 
					    numpy>=1.15.0
 | 
				
			||||||
    plac>=0.9.6,<1.2.0
 | 
					    plac>=0.9.6,<1.2.0
 | 
				
			||||||
    requests>=2.13.0,<3.0.0
 | 
					    requests>=2.13.0,<3.0.0
 | 
				
			||||||
    wasabi>=0.3.0,<1.1.0
 | 
					 | 
				
			||||||
    srsly>=0.1.0,<1.1.0
 | 
					 | 
				
			||||||
    pathlib==1.0.1; python_version < "3.4"
 | 
					    pathlib==1.0.1; python_version < "3.4"
 | 
				
			||||||
    importlib_metadata>=0.20; python_version < "3.8"
 | 
					    importlib_metadata>=0.20; python_version < "3.8"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -7,12 +7,10 @@ from __future__ import print_function
 | 
				
			||||||
if __name__ == "__main__":
 | 
					if __name__ == "__main__":
 | 
				
			||||||
    import plac
 | 
					    import plac
 | 
				
			||||||
    import sys
 | 
					    import sys
 | 
				
			||||||
    from wasabi import Printer
 | 
					    from wasabi import msg
 | 
				
			||||||
    from spacy.cli import download, link, info, package, train, pretrain, convert
 | 
					    from spacy.cli import download, link, info, package, train, pretrain, convert
 | 
				
			||||||
    from spacy.cli import init_model, profile, evaluate, validate, debug_data
 | 
					    from spacy.cli import init_model, profile, evaluate, validate, debug_data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    msg = Printer()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    commands = {
 | 
					    commands = {
 | 
				
			||||||
        "download": download,
 | 
					        "download": download,
 | 
				
			||||||
        "link": link,
 | 
					        "link": link,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -121,6 +121,8 @@ def debug_data(
 | 
				
			||||||
    msg.text("{} training docs".format(len(train_docs)))
 | 
					    msg.text("{} training docs".format(len(train_docs)))
 | 
				
			||||||
    msg.text("{} evaluation docs".format(len(dev_docs)))
 | 
					    msg.text("{} evaluation docs".format(len(dev_docs)))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if not len(dev_docs):
 | 
				
			||||||
 | 
					        msg.fail("No evaluation docs")
 | 
				
			||||||
    overlap = len(train_texts.intersection(dev_texts))
 | 
					    overlap = len(train_texts.intersection(dev_texts))
 | 
				
			||||||
    if overlap:
 | 
					    if overlap:
 | 
				
			||||||
        msg.warn("{} training examples also in evaluation data".format(overlap))
 | 
					        msg.warn("{} training examples also in evaluation data".format(overlap))
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -6,16 +6,13 @@ import requests
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import subprocess
 | 
					import subprocess
 | 
				
			||||||
import sys
 | 
					import sys
 | 
				
			||||||
from wasabi import Printer
 | 
					from wasabi import msg
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .link import link
 | 
					from .link import link
 | 
				
			||||||
from ..util import get_package_path
 | 
					from ..util import get_package_path
 | 
				
			||||||
from .. import about
 | 
					from .. import about
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
msg = Printer()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@plac.annotations(
 | 
					@plac.annotations(
 | 
				
			||||||
    model=("Model to download (shortcut or name)", "positional", None, str),
 | 
					    model=("Model to download (shortcut or name)", "positional", None, str),
 | 
				
			||||||
    direct=("Force direct download of name + version", "flag", "d", bool),
 | 
					    direct=("Force direct download of name + version", "flag", "d", bool),
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3,7 +3,7 @@ from __future__ import unicode_literals, division, print_function
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import plac
 | 
					import plac
 | 
				
			||||||
from timeit import default_timer as timer
 | 
					from timeit import default_timer as timer
 | 
				
			||||||
from wasabi import Printer
 | 
					from wasabi import msg
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..gold import GoldCorpus
 | 
					from ..gold import GoldCorpus
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
| 
						 | 
					@ -32,7 +32,6 @@ def evaluate(
 | 
				
			||||||
    Evaluate a model. To render a sample of parses in a HTML file, set an
 | 
					    Evaluate a model. To render a sample of parses in a HTML file, set an
 | 
				
			||||||
    output directory as the displacy_path argument.
 | 
					    output directory as the displacy_path argument.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    msg = Printer()
 | 
					 | 
				
			||||||
    util.fix_random_seed()
 | 
					    util.fix_random_seed()
 | 
				
			||||||
    if gpu_id >= 0:
 | 
					    if gpu_id >= 0:
 | 
				
			||||||
        util.use_gpu(gpu_id)
 | 
					        util.use_gpu(gpu_id)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4,7 +4,7 @@ from __future__ import unicode_literals
 | 
				
			||||||
import plac
 | 
					import plac
 | 
				
			||||||
import platform
 | 
					import platform
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from wasabi import Printer
 | 
					from wasabi import msg
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..compat import path2str, basestring_, unicode_
 | 
					from ..compat import path2str, basestring_, unicode_
 | 
				
			||||||
| 
						 | 
					@ -23,7 +23,6 @@ def info(model=None, markdown=False, silent=False):
 | 
				
			||||||
    speficied as an argument, print model information. Flag --markdown
 | 
					    speficied as an argument, print model information. Flag --markdown
 | 
				
			||||||
    prints details in Markdown for easy copy-pasting to GitHub issues.
 | 
					    prints details in Markdown for easy copy-pasting to GitHub issues.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    msg = Printer()
 | 
					 | 
				
			||||||
    if model:
 | 
					    if model:
 | 
				
			||||||
        if util.is_package(model):
 | 
					        if util.is_package(model):
 | 
				
			||||||
            model_path = util.get_package_path(model)
 | 
					            model_path = util.get_package_path(model)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -11,7 +11,7 @@ import tarfile
 | 
				
			||||||
import gzip
 | 
					import gzip
 | 
				
			||||||
import zipfile
 | 
					import zipfile
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
from wasabi import Printer
 | 
					from wasabi import msg
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..vectors import Vectors
 | 
					from ..vectors import Vectors
 | 
				
			||||||
from ..errors import Errors, Warnings, user_warning
 | 
					from ..errors import Errors, Warnings, user_warning
 | 
				
			||||||
| 
						 | 
					@ -24,7 +24,6 @@ except ImportError:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
DEFAULT_OOV_PROB = -20
 | 
					DEFAULT_OOV_PROB = -20
 | 
				
			||||||
msg = Printer()
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@plac.annotations(
 | 
					@plac.annotations(
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3,7 +3,7 @@ from __future__ import unicode_literals
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import plac
 | 
					import plac
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from wasabi import Printer
 | 
					from wasabi import msg
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..compat import symlink_to, path2str
 | 
					from ..compat import symlink_to, path2str
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
| 
						 | 
					@ -20,7 +20,6 @@ def link(origin, link_name, force=False, model_path=None):
 | 
				
			||||||
    either the name of a pip package, or the local path to the model data
 | 
					    either the name of a pip package, or the local path to the model data
 | 
				
			||||||
    directory. Linking models allows loading them via spacy.load(link_name).
 | 
					    directory. Linking models allows loading them via spacy.load(link_name).
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    msg = Printer()
 | 
					 | 
				
			||||||
    if util.is_package(origin):
 | 
					    if util.is_package(origin):
 | 
				
			||||||
        model_path = util.get_package_path(origin)
 | 
					        model_path = util.get_package_path(origin)
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4,7 +4,7 @@ from __future__ import unicode_literals
 | 
				
			||||||
import plac
 | 
					import plac
 | 
				
			||||||
import shutil
 | 
					import shutil
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from wasabi import Printer, get_raw_input
 | 
					from wasabi import msg, get_raw_input
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..compat import path2str
 | 
					from ..compat import path2str
 | 
				
			||||||
| 
						 | 
					@ -27,7 +27,6 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
 | 
				
			||||||
    set and a meta.json already exists in the output directory, the existing
 | 
					    set and a meta.json already exists in the output directory, the existing
 | 
				
			||||||
    values will be used as the defaults in the command-line prompt.
 | 
					    values will be used as the defaults in the command-line prompt.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    msg = Printer()
 | 
					 | 
				
			||||||
    input_path = util.ensure_path(input_dir)
 | 
					    input_path = util.ensure_path(input_dir)
 | 
				
			||||||
    output_path = util.ensure_path(output_dir)
 | 
					    output_path = util.ensure_path(output_dir)
 | 
				
			||||||
    meta_path = util.ensure_path(meta_path)
 | 
					    meta_path = util.ensure_path(meta_path)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -11,7 +11,7 @@ from pathlib import Path
 | 
				
			||||||
from thinc.v2v import Affine, Maxout
 | 
					from thinc.v2v import Affine, Maxout
 | 
				
			||||||
from thinc.misc import LayerNorm as LN
 | 
					from thinc.misc import LayerNorm as LN
 | 
				
			||||||
from thinc.neural.util import prefer_gpu
 | 
					from thinc.neural.util import prefer_gpu
 | 
				
			||||||
from wasabi import Printer
 | 
					from wasabi import msg
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..errors import Errors
 | 
					from ..errors import Errors
 | 
				
			||||||
| 
						 | 
					@ -122,7 +122,6 @@ def pretrain(
 | 
				
			||||||
    for key in config:
 | 
					    for key in config:
 | 
				
			||||||
        if isinstance(config[key], Path):
 | 
					        if isinstance(config[key], Path):
 | 
				
			||||||
            config[key] = str(config[key])
 | 
					            config[key] = str(config[key])
 | 
				
			||||||
    msg = Printer()
 | 
					 | 
				
			||||||
    util.fix_random_seed(seed)
 | 
					    util.fix_random_seed(seed)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    has_gpu = prefer_gpu()
 | 
					    has_gpu = prefer_gpu()
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -9,7 +9,7 @@ import pstats
 | 
				
			||||||
import sys
 | 
					import sys
 | 
				
			||||||
import itertools
 | 
					import itertools
 | 
				
			||||||
import thinc.extra.datasets
 | 
					import thinc.extra.datasets
 | 
				
			||||||
from wasabi import Printer
 | 
					from wasabi import msg
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..util import load_model
 | 
					from ..util import load_model
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -26,7 +26,6 @@ def profile(model, inputs=None, n_texts=10000):
 | 
				
			||||||
    It can either be provided as a JSONL file, or be read from sys.sytdin.
 | 
					    It can either be provided as a JSONL file, or be read from sys.sytdin.
 | 
				
			||||||
    If no input file is specified, the IMDB dataset is loaded via Thinc.
 | 
					    If no input file is specified, the IMDB dataset is loaded via Thinc.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    msg = Printer()
 | 
					 | 
				
			||||||
    if inputs is not None:
 | 
					    if inputs is not None:
 | 
				
			||||||
        inputs = _read_inputs(inputs, msg)
 | 
					        inputs = _read_inputs(inputs, msg)
 | 
				
			||||||
    if inputs is None:
 | 
					    if inputs is None:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -8,7 +8,7 @@ from thinc.neural._classes.model import Model
 | 
				
			||||||
from timeit import default_timer as timer
 | 
					from timeit import default_timer as timer
 | 
				
			||||||
import shutil
 | 
					import shutil
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
from wasabi import Printer
 | 
					from wasabi import msg
 | 
				
			||||||
import contextlib
 | 
					import contextlib
 | 
				
			||||||
import random
 | 
					import random
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -89,7 +89,6 @@ def train(
 | 
				
			||||||
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
 | 
					    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
 | 
				
			||||||
    import tqdm
 | 
					    import tqdm
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    msg = Printer()
 | 
					 | 
				
			||||||
    util.fix_random_seed()
 | 
					    util.fix_random_seed()
 | 
				
			||||||
    util.set_env_log(verbose)
 | 
					    util.set_env_log(verbose)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -5,7 +5,7 @@ from pathlib import Path
 | 
				
			||||||
import sys
 | 
					import sys
 | 
				
			||||||
import requests
 | 
					import requests
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
from wasabi import Printer
 | 
					from wasabi import msg
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..compat import path2str
 | 
					from ..compat import path2str
 | 
				
			||||||
from ..util import get_data_path
 | 
					from ..util import get_data_path
 | 
				
			||||||
| 
						 | 
					@ -17,7 +17,6 @@ def validate():
 | 
				
			||||||
    Validate that the currently installed version of spaCy is compatible
 | 
					    Validate that the currently installed version of spaCy is compatible
 | 
				
			||||||
    with the installed models. Should be run after `pip install -U spacy`.
 | 
					    with the installed models. Should be run after `pip install -U spacy`.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    msg = Printer()
 | 
					 | 
				
			||||||
    with msg.loading("Loading compatibility table..."):
 | 
					    with msg.loading("Loading compatibility table..."):
 | 
				
			||||||
        r = requests.get(about.__compatibility__)
 | 
					        r = requests.get(about.__compatibility__)
 | 
				
			||||||
        if r.status_code != 200:
 | 
					        if r.status_code != 200:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -82,6 +82,7 @@ class Scorer(object):
 | 
				
			||||||
        self.sbd = PRFScore()
 | 
					        self.sbd = PRFScore()
 | 
				
			||||||
        self.unlabelled = PRFScore()
 | 
					        self.unlabelled = PRFScore()
 | 
				
			||||||
        self.labelled = PRFScore()
 | 
					        self.labelled = PRFScore()
 | 
				
			||||||
 | 
					        self.labelled_per_dep = dict()
 | 
				
			||||||
        self.tags = PRFScore()
 | 
					        self.tags = PRFScore()
 | 
				
			||||||
        self.ner = PRFScore()
 | 
					        self.ner = PRFScore()
 | 
				
			||||||
        self.ner_per_ents = dict()
 | 
					        self.ner_per_ents = dict()
 | 
				
			||||||
| 
						 | 
					@ -124,9 +125,18 @@ class Scorer(object):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @property
 | 
					    @property
 | 
				
			||||||
    def las(self):
 | 
					    def las(self):
 | 
				
			||||||
        """RETURNS (float): Labelled depdendency score."""
 | 
					        """RETURNS (float): Labelled dependency score."""
 | 
				
			||||||
        return self.labelled.fscore * 100
 | 
					        return self.labelled.fscore * 100
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @property
 | 
				
			||||||
 | 
					    def las_per_type(self):
 | 
				
			||||||
 | 
					        """RETURNS (dict): Scores per dependency label.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        return {
 | 
				
			||||||
 | 
					            k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
 | 
				
			||||||
 | 
					            for k, v in self.labelled_per_dep.items()
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @property
 | 
					    @property
 | 
				
			||||||
    def ents_p(self):
 | 
					    def ents_p(self):
 | 
				
			||||||
        """RETURNS (float): Named entity accuracy (precision)."""
 | 
					        """RETURNS (float): Named entity accuracy (precision)."""
 | 
				
			||||||
| 
						 | 
					@ -196,6 +206,7 @@ class Scorer(object):
 | 
				
			||||||
        return {
 | 
					        return {
 | 
				
			||||||
            "uas": self.uas,
 | 
					            "uas": self.uas,
 | 
				
			||||||
            "las": self.las,
 | 
					            "las": self.las,
 | 
				
			||||||
 | 
					            "las_per_type": self.las_per_type,
 | 
				
			||||||
            "ents_p": self.ents_p,
 | 
					            "ents_p": self.ents_p,
 | 
				
			||||||
            "ents_r": self.ents_r,
 | 
					            "ents_r": self.ents_r,
 | 
				
			||||||
            "ents_f": self.ents_f,
 | 
					            "ents_f": self.ents_f,
 | 
				
			||||||
| 
						 | 
					@ -223,13 +234,20 @@ class Scorer(object):
 | 
				
			||||||
                doc, tuple(zip(*gold.orig_annot)) + (gold.cats,)
 | 
					                doc, tuple(zip(*gold.orig_annot)) + (gold.cats,)
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        gold_deps = set()
 | 
					        gold_deps = set()
 | 
				
			||||||
 | 
					        gold_deps_per_dep = {}
 | 
				
			||||||
        gold_tags = set()
 | 
					        gold_tags = set()
 | 
				
			||||||
        gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
 | 
					        gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
 | 
				
			||||||
        for id_, word, tag, head, dep, ner in gold.orig_annot:
 | 
					        for id_, word, tag, head, dep, ner in gold.orig_annot:
 | 
				
			||||||
            gold_tags.add((id_, tag))
 | 
					            gold_tags.add((id_, tag))
 | 
				
			||||||
            if dep not in (None, "") and dep.lower() not in punct_labels:
 | 
					            if dep not in (None, "") and dep.lower() not in punct_labels:
 | 
				
			||||||
                gold_deps.add((id_, head, dep.lower()))
 | 
					                gold_deps.add((id_, head, dep.lower()))
 | 
				
			||||||
 | 
					                if dep.lower() not in self.labelled_per_dep:
 | 
				
			||||||
 | 
					                    self.labelled_per_dep[dep.lower()] = PRFScore()
 | 
				
			||||||
 | 
					                if dep.lower() not in gold_deps_per_dep:
 | 
				
			||||||
 | 
					                    gold_deps_per_dep[dep.lower()] = set()
 | 
				
			||||||
 | 
					                gold_deps_per_dep[dep.lower()].add((id_, head, dep.lower()))
 | 
				
			||||||
        cand_deps = set()
 | 
					        cand_deps = set()
 | 
				
			||||||
 | 
					        cand_deps_per_dep = {}
 | 
				
			||||||
        cand_tags = set()
 | 
					        cand_tags = set()
 | 
				
			||||||
        for token in doc:
 | 
					        for token in doc:
 | 
				
			||||||
            if token.orth_.isspace():
 | 
					            if token.orth_.isspace():
 | 
				
			||||||
| 
						 | 
					@ -249,6 +267,11 @@ class Scorer(object):
 | 
				
			||||||
                    self.labelled.fp += 1
 | 
					                    self.labelled.fp += 1
 | 
				
			||||||
                else:
 | 
					                else:
 | 
				
			||||||
                    cand_deps.add((gold_i, gold_head, token.dep_.lower()))
 | 
					                    cand_deps.add((gold_i, gold_head, token.dep_.lower()))
 | 
				
			||||||
 | 
					                    if token.dep_.lower() not in self.labelled_per_dep:
 | 
				
			||||||
 | 
					                        self.labelled_per_dep[token.dep_.lower()] = PRFScore()
 | 
				
			||||||
 | 
					                    if token.dep_.lower() not in cand_deps_per_dep:
 | 
				
			||||||
 | 
					                        cand_deps_per_dep[token.dep_.lower()] = set()
 | 
				
			||||||
 | 
					                    cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower()))
 | 
				
			||||||
        if "-" not in [token[-1] for token in gold.orig_annot]:
 | 
					        if "-" not in [token[-1] for token in gold.orig_annot]:
 | 
				
			||||||
            # Find all NER labels in gold and doc
 | 
					            # Find all NER labels in gold and doc
 | 
				
			||||||
            ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
 | 
					            ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
 | 
				
			||||||
| 
						 | 
					@ -280,6 +303,8 @@ class Scorer(object):
 | 
				
			||||||
            self.ner.score_set(cand_ents, gold_ents)
 | 
					            self.ner.score_set(cand_ents, gold_ents)
 | 
				
			||||||
        self.tags.score_set(cand_tags, gold_tags)
 | 
					        self.tags.score_set(cand_tags, gold_tags)
 | 
				
			||||||
        self.labelled.score_set(cand_deps, gold_deps)
 | 
					        self.labelled.score_set(cand_deps, gold_deps)
 | 
				
			||||||
 | 
					        for dep in self.labelled_per_dep:
 | 
				
			||||||
 | 
					            self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set()))
 | 
				
			||||||
        self.unlabelled.score_set(
 | 
					        self.unlabelled.score_set(
 | 
				
			||||||
            set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
 | 
					            set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -9,6 +9,14 @@ from spacy.scorer import Scorer, ROCAUCScore
 | 
				
			||||||
from spacy.scorer import _roc_auc_score, _roc_curve
 | 
					from spacy.scorer import _roc_auc_score, _roc_curve
 | 
				
			||||||
from .util import get_doc
 | 
					from .util import get_doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					test_las_apple = [
 | 
				
			||||||
 | 
					    [
 | 
				
			||||||
 | 
					        "Apple is looking at buying U.K. startup for $ 1 billion",
 | 
				
			||||||
 | 
					        {"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
 | 
				
			||||||
 | 
					         "deps": ['nsubj', 'aux', 'ROOT', 'prep', 'pcomp', 'compound', 'dobj', 'prep', 'quantmod', 'compound', 'pobj']},
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
test_ner_cardinal = [
 | 
					test_ner_cardinal = [
 | 
				
			||||||
    ["100 - 200", {"entities": [[0, 3, "CARDINAL"], [6, 9, "CARDINAL"]]}]
 | 
					    ["100 - 200", {"entities": [[0, 3, "CARDINAL"], [6, 9, "CARDINAL"]]}]
 | 
				
			||||||
]
 | 
					]
 | 
				
			||||||
| 
						 | 
					@ -21,6 +29,53 @@ test_ner_apple = [
 | 
				
			||||||
]
 | 
					]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_las_per_type(en_vocab):
 | 
				
			||||||
 | 
					    # Gold and Doc are identical
 | 
				
			||||||
 | 
					    scorer = Scorer()
 | 
				
			||||||
 | 
					    for input_, annot in test_las_apple:
 | 
				
			||||||
 | 
					        doc = get_doc(
 | 
				
			||||||
 | 
					            en_vocab,
 | 
				
			||||||
 | 
					            words=input_.split(" "),
 | 
				
			||||||
 | 
					            heads=([h - i for i, h in enumerate(annot["heads"])]),
 | 
				
			||||||
 | 
					            deps=annot["deps"],
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
 | 
				
			||||||
 | 
					        scorer.score(doc, gold)
 | 
				
			||||||
 | 
					    results = scorer.scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    assert results["uas"] == 100
 | 
				
			||||||
 | 
					    assert results["las"] == 100
 | 
				
			||||||
 | 
					    assert results["las_per_type"]["nsubj"]["p"] == 100
 | 
				
			||||||
 | 
					    assert results["las_per_type"]["nsubj"]["r"] == 100
 | 
				
			||||||
 | 
					    assert results["las_per_type"]["nsubj"]["f"] == 100
 | 
				
			||||||
 | 
					    assert results["las_per_type"]["compound"]["p"] == 100
 | 
				
			||||||
 | 
					    assert results["las_per_type"]["compound"]["r"] == 100
 | 
				
			||||||
 | 
					    assert results["las_per_type"]["compound"]["f"] == 100
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # One dep is incorrect in Doc
 | 
				
			||||||
 | 
					    scorer = Scorer()
 | 
				
			||||||
 | 
					    for input_, annot in test_las_apple:
 | 
				
			||||||
 | 
					        doc = get_doc(
 | 
				
			||||||
 | 
					            en_vocab,
 | 
				
			||||||
 | 
					            words=input_.split(" "),
 | 
				
			||||||
 | 
					            heads=([h - i for i, h in enumerate(annot["heads"])]),
 | 
				
			||||||
 | 
					            deps=annot["deps"]
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
 | 
				
			||||||
 | 
					        doc[0].dep_ = "compound"
 | 
				
			||||||
 | 
					        scorer.score(doc, gold)
 | 
				
			||||||
 | 
					    results = scorer.scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    assert results["uas"] == 100
 | 
				
			||||||
 | 
					    assert_almost_equal(results["las"], 90.9090909)
 | 
				
			||||||
 | 
					    assert results["las_per_type"]["nsubj"]["p"] == 0
 | 
				
			||||||
 | 
					    assert results["las_per_type"]["nsubj"]["r"] == 0
 | 
				
			||||||
 | 
					    assert results["las_per_type"]["nsubj"]["f"] == 0
 | 
				
			||||||
 | 
					    assert_almost_equal(results["las_per_type"]["compound"]["p"], 66.6666666)
 | 
				
			||||||
 | 
					    assert results["las_per_type"]["compound"]["r"] == 100
 | 
				
			||||||
 | 
					    assert results["las_per_type"]["compound"]["f"] == 80
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_ner_per_type(en_vocab):
 | 
					def test_ner_per_type(en_vocab):
 | 
				
			||||||
    # Gold and Doc are identical
 | 
					    # Gold and Doc are identical
 | 
				
			||||||
    scorer = Scorer()
 | 
					    scorer = Scorer()
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1861,6 +1861,30 @@
 | 
				
			||||||
            "author_links": {
 | 
					            "author_links": {
 | 
				
			||||||
                "github": "microsoft"
 | 
					                "github": "microsoft"
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "id": "dframcy",
 | 
				
			||||||
 | 
					            "title": "Dframcy",
 | 
				
			||||||
 | 
					            "slogan": "Dataframe Integration with spaCy NLP",
 | 
				
			||||||
 | 
					            "github": "yash1994/dframcy",
 | 
				
			||||||
 | 
					            "description": "DframCy is a light-weight utility module to integrate Pandas Dataframe to spaCy's linguistic annotation and training tasks.",
 | 
				
			||||||
 | 
					            "pip": "dframcy",
 | 
				
			||||||
 | 
					            "category": ["pipeline", "training"],
 | 
				
			||||||
 | 
					            "tags": ["pandas"],
 | 
				
			||||||
 | 
					            "code_example": [
 | 
				
			||||||
 | 
					                "import spacy",
 | 
				
			||||||
 | 
					                "from dframcy import DframCy",
 | 
				
			||||||
 | 
					                "",
 | 
				
			||||||
 | 
					                "nlp = spacy.load('en_core_web_sm')",
 | 
				
			||||||
 | 
					                "dframcy = DframCy(nlp)",
 | 
				
			||||||
 | 
					                "doc = dframcy.nlp(u'Apple is looking at buying U.K. startup for $1 billion')",
 | 
				
			||||||
 | 
					                "annotation_dataframe = dframcy.to_dataframe(doc)"
 | 
				
			||||||
 | 
					            ],
 | 
				
			||||||
 | 
					            "author": "Yash Patadia",
 | 
				
			||||||
 | 
					            "author_links": {
 | 
				
			||||||
 | 
					                "twitter": "PatadiaYash",
 | 
				
			||||||
 | 
					                "github": "yash1994"
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    ],
 | 
					    ],
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user