From f8c4ee34fe4c3ae2be41d24b240aa2d795fe687e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 7 Dec 2018 01:43:07 +0100
Subject: [PATCH 01/11] Update wasabi pin

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 8213bac7b..dd8631c14 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ preshed>=2.0.1,<2.1.0
 thinc==7.0.0.dev6
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.0.8,<1.1.0
+wasabi>=0.0.10,<1.1.0
 srsly>=0.0.5,<1.1.0
 # Third party dependencies
 numpy>=1.15.0

From b2bfd1e1c8fb1fe038f2da1fc3b188f42bdc1041 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 7 Dec 2018 20:54:35 +0100
Subject: [PATCH 02/11] Move dropout and batch sizes out of global scope in
 train cmd

---
 spacy/cli/train.py | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 8d322e32d..90decdc12 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -18,22 +18,6 @@ from .. import util
 from .. import about
 
 
-# Take dropout and batch size as generators of values -- dropout
-# starts high and decays sharply, to force the optimizer to explore.
-# Batch size starts at 1 and grows, so that we make updates quickly
-# at the beginning of training.
-dropout_rates = util.decaying(
-    util.env_opt("dropout_from", 0.1),
-    util.env_opt("dropout_to", 0.1),
-    util.env_opt("dropout_decay", 0.0),
-)
-batch_sizes = util.compounding(
-    util.env_opt("batch_from", 750),
-    util.env_opt("batch_to", 750),
-    util.env_opt("batch_compound", 1.001),
-)
-
-
 @plac.annotations(
     lang=("Model language", "positional", None, str),
     output_path=("Output directory to store model in", "positional", None, Path),
@@ -120,6 +104,21 @@ def train(
     if not output_path.exists():
         output_path.mkdir()
 
+    # Take dropout and batch size as generators of values -- dropout
+    # starts high and decays sharply, to force the optimizer to explore.
+    # Batch size starts at 1 and grows, so that we make updates quickly
+    # at the beginning of training.
+    dropout_rates = util.decaying(
+        util.env_opt("dropout_from", 0.1),
+        util.env_opt("dropout_to", 0.1),
+        util.env_opt("dropout_decay", 0.0),
+    )
+    batch_sizes = util.compounding(
+        util.env_opt("batch_from", 100.0),
+        util.env_opt("batch_to", 1000.0),
+        util.env_opt("batch_compound", 1.001),
+    )
+
     # Set up the base model and pipeline. If a base model is specified, load
     # the model and make sure the pipeline matches the pipeline setting. If
     # training starts from a blank model, intitalize the language class.
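
For reference, the two schedules moved into train() behave roughly like
this (a sketch, not part of the patch; next() advances each generator by
one step, and the values are the new defaults from the hunk above):

    from spacy import util

    dropout_rates = util.decaying(0.1, 0.1, 0.0)  # from, to, decay -> constant 0.1
    batch_sizes = util.compounding(100.0, 1000.0, 1.001)  # start, stop, compound
    next(batch_sizes)  # 100.0
    next(batch_sizes)  # 100.1 -- multiplied by 1.001 each step, capped at 1000.0
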
From 6f36b6bc4eb70e0e536ebcc7646329d73a7d3bb7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 7 Dec 2018 23:42:48 +0100
Subject: [PATCH 03/11] Pin pex version

---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 34566d80c..2834096b7 100644
--- a/Makefile
+++ b/Makefile
@@ -5,11 +5,11 @@ dist/spacy.pex : spacy/*.py* spacy/*/*.py*
 	python3.6 -m venv env3.6
 	source env3.6/bin/activate
 	env3.6/bin/pip install wheel
-	env3.6/bin/pip install -r requirements.txt --no-cache-dir --no-binary :all:
+	env3.6/bin/pip install -r requirements.txt --no-cache-dir
 	env3.6/bin/python setup.py build_ext --inplace
 	env3.6/bin/python setup.py sdist
 	env3.6/bin/python setup.py bdist_wheel
-	env3.6/bin/python -m pip install pex
+	env3.6/bin/python -m pip install pex==1.5.3
 	env3.6/bin/pex pytest dist/*.whl -e spacy -o dist/spacy-$(sha).pex
 	cp dist/spacy-$(sha).pex dist/spacy.pex
 	chmod a+rx dist/spacy.pex

From a338c6f8f64906f9f4c8b109779a11b717c4792e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 8 Dec 2018 10:41:24 +0100
Subject: [PATCH 04/11] Fix JSON segmentation bug that affected French

Fix a bug in the JSON streaming code that GoldCorpus uses. Escaped
slashes were being handled incorrectly. This bug caused low scores for
French in the early v2.1.0 alphas, because most of the data was not
being read in. Fittingly, the document that triggered the bug was a
Wikipedia article about Perl. Parsing Perl remains difficult!

---
 spacy/gold.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 8bdd42a83..69e256167 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -346,12 +346,12 @@ def _json_iterate(loc):
     cdef char close_curly = ord('}')
     for i in range(len(py_raw)):
         c = raw[i]
-        if c == backslash:
-            escape = True
-            continue
         if escape:
             escape = False
             continue
+        if c == backslash:
+            escape = True
+            continue
         if c == quote:
             inside_string = not inside_string
             continue

From 8aa7882762e9be1af8c42885bb874a89f2c730b3 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 8 Dec 2018 10:49:10 +0100
Subject: [PATCH 05/11] Make NORM a token attribute (#3029)

See #3028. The solution in this patch is pretty debatable. What we do
is give the TokenC struct a .norm field, by repurposing the previously
idle .sense attribute. It's nice to repurpose a previous field because
it means the TokenC doesn't change size, so even if someone's using the
internals very deeply, nothing will break.

The weird thing here is that the TokenC and the LexemeC both have an
attribute named NORM. This arguably assists in backwards compatibility.
On the other hand, maybe it's really bad! We're changing the semantics
of the attribute subtly, so maybe it's better if someone calling
lex.norm gets a breakage, and instead is told to write lex.default_norm?

Overall I believe this patch makes the NORM feature work the way we
sort of expected it to work. Certainly it's much more like how the docs
describe it, and more in line with how we've been directing people to
use the norm attribute. We'll also be able to use token.norm to do
stuff like spelling correction, which is pretty cool.
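
For example, the intended usage looks roughly like this (a sketch, not
part of the patch; the blank English class is enough, as in the new
regression test below, and "definately" is a deliberate misspelling):

    from spacy.lang.en import English

    nlp = English()
    doc = nlp("definately cool")
    doc[0].norm_ = "definitely"    # token-level norm, writeable after this patch
    assert doc[0].norm_ == "definitely"
    assert doc[1].norm_ == "cool"  # still falls back to the lexeme's norm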
---
 spacy/structs.pxd                        |  2 +-
 spacy/tests/regression/test_issue2754.py | 14 ++++++++++++++
 spacy/tokens/token.pxd                   |  7 +++++++
 spacy/tokens/token.pyx                   | 10 ++++++++--
 spacy/vocab.pyx                          |  7 +++++--
 5 files changed, 35 insertions(+), 5 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue2754.py

diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index cfcadc3d0..fa282cae7 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -58,7 +58,7 @@ cdef struct TokenC:
     attr_t tag
     int idx
    attr_t lemma
-    attr_t sense
+    attr_t norm
    int head
    attr_t dep

diff --git a/spacy/tests/regression/test_issue2754.py b/spacy/tests/regression/test_issue2754.py
new file mode 100644
index 000000000..5f76727f8
--- /dev/null
+++ b/spacy/tests/regression/test_issue2754.py
@@ -0,0 +1,14 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.lang.en import English
+
+def test_issue2754():
+    """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
+    nlp = English()
+    a = nlp('a')
+    assert a[0].norm_ == 'a'
+    am = nlp('am')
+    assert am[0].norm_ == 'am'
+

diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index 9b02d07fb..bb9f7d070 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -34,6 +34,11 @@ cdef class Token:
             return Lexeme.c_check_flag(token.lex, feat_name)
         elif feat_name == LEMMA:
             return token.lemma
+        elif feat_name == NORM:
+            if token.norm == 0:
+                return token.lex.norm
+            else:
+                return token.norm
         elif feat_name == POS:
             return token.pos
         elif feat_name == TAG:
@@ -58,6 +63,8 @@ cdef class Token:
                                     attr_t value) nogil:
         if feat_name == LEMMA:
             token.lemma = value
+        elif feat_name == NORM:
+            token.norm = value
         elif feat_name == POS:
             token.pos = value
         elif feat_name == TAG:

diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 5c8af1333..0266004b5 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -249,7 +249,10 @@ cdef class Token:
         or norm exceptions.
         """
         def __get__(self):
-            return self.c.lex.norm
+            if self.c.norm == 0:
+                return self.c.lex.norm
+            else:
+                return self.c.norm
 
     property shape:
         """RETURNS (uint64): ID of the token's shape, a transform of the
@@ -711,7 +714,10 @@ cdef class Token:
         norm exceptions.
         """
         def __get__(self):
-            return self.vocab.strings[self.c.lex.norm]
+            return self.vocab.strings[self.norm]
+
+        def __set__(self, unicode norm_):
+            self.c.norm = self.vocab.strings.add(norm_)
 
     property shape_:
         """RETURNS (unicode): Transform of the tokens's string, to show

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 42fd2f46e..e28aa0b86 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -17,7 +17,7 @@ from .structs cimport SerializedLexemeC
 from .compat import copy_reg, basestring_
 from .errors import Errors
 from .lemmatizer import Lemmatizer
-from .attrs import intify_attrs
+from .attrs import intify_attrs, NORM
 from .vectors import Vectors
 from ._ml import link_vectors_to_models
 from . import util
@@ -234,7 +234,10 @@ cdef class Vocab:
                 self.morphology.assign_tag(token, props[TAG])
             for attr_id, value in props.items():
                 Token.set_struct_attr(token, attr_id, value)
-                Lexeme.set_struct_attr(lex, attr_id, value)
+                # NORM is the only one that overlaps between the two
+                # (which is maybe not great?)
+                if attr_id != NORM:
+                    Lexeme.set_struct_attr(lex, attr_id, value)
         return tokens
 
     @property

From ffdd5e964f0f41a3140a6df5e967bfd277f57f2a Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 8 Dec 2018 11:49:43 +0100
Subject: [PATCH 06/11] Small CLI improvements (#3030)

* Add todo

* Auto-format

* Update wasabi pin

* Format training results with wasabi

* Remove loading animation from model saving

Currently behaves weirdly

* Inline messages

* Remove unnecessary path2str

Already taken care of by printer

* Inline messages in CLI

* Remove unused function

* Move loading indicator into loading function

* Check for invalid whitespace entities
---
 requirements.txt                   |   2 +-
 setup.py                           |  12 ++--
 spacy/cli/_messages.py             | 105 -----------------------------
 spacy/cli/convert.py               |  17 ++---
 spacy/cli/converters/jsonl2json.py |   3 +-
 spacy/cli/debug_data.py            |  47 +++++++------
 spacy/cli/download.py              |  22 ++++--
 spacy/cli/evaluate.py              |  43 +-----------
 spacy/cli/info.py                  |   3 +-
 spacy/cli/init_model.py            |  17 +++--
 spacy/cli/link.py                  |  38 ++++++++---
 spacy/cli/package.py               |  32 +++++----
 spacy/cli/train.py                 | 105 +++++++++++++++++------------
 spacy/cli/validate.py              |  24 +++++--
 14 files changed, 199 insertions(+), 271 deletions(-)
 delete mode 100644 spacy/cli/_messages.py

diff --git a/requirements.txt b/requirements.txt
index dd8631c14..5cedaf835 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ preshed>=2.0.1,<2.1.0
 thinc==7.0.0.dev6
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.0.10,<1.1.0
+wasabi>=0.0.12,<1.1.0
 srsly>=0.0.5,<1.1.0
 # Third party dependencies
 numpy>=1.15.0

diff --git a/setup.py b/setup.py
index 90d2dfb6f..5e6184223 100755
--- a/setup.py
+++ b/setup.py
@@ -13,12 +13,12 @@ from setuptools import Extension, setup, find_packages
 
 
 def is_new_osx():
-    '''Check whether we're on OSX >= 10.10'''
+    """Check whether we're on OSX >= 10.10"""
     name = distutils.util.get_platform()
-    if sys.platform != 'darwin':
+    if sys.platform != "darwin":
         return False
-    elif name.startswith('macosx-10'):
-        minor_version = int(name.split('-')[1].split('.')[1])
+    elif name.startswith("macosx-10"):
+        minor_version = int(name.split("-")[1].split(".")[1])
         if minor_version >= 7:
             return True
         else:
@@ -27,7 +27,6 @@ def is_new_osx():
     return False
 
 
-
 PACKAGE_DATA = {"": ["*.pyx", "*.pxd", "*.txt", "*.tokens"]}
 
 
@@ -84,7 +83,6 @@ if is_new_osx():
     LINK_OPTIONS["other"].append("-nodefaultlibs")
 
 
-
 USE_OPENMP_DEFAULT = "0" if sys.platform != "darwin" else None
 if os.environ.get("USE_OPENMP", USE_OPENMP_DEFAULT) == "1":
     if sys.platform == "darwin":
@@ -232,7 +230,7 @@ def setup_package():
             "regex==2018.01.10",
             "requests>=2.13.0,<3.0.0",
             "jsonschema>=2.6.0,<3.0.0",
-            "wasabi>=0.0.8,<1.1.0",
+            "wasabi>=0.0.12,<1.1.0",
             "srsly>=0.0.5,<1.1.0",
             'pathlib==1.0.1; python_version < "3.4"',
         ],

diff --git a/spacy/cli/_messages.py b/spacy/cli/_messages.py
deleted file mode 100644
index 2ac6599c5..000000000
--- a/spacy/cli/_messages.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
-# fmt: off
-
-class Messages(object):
-    M001 = ("Download successful but linking failed")
-    M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
-            "don't have admin permissions?), but you can still load the "
-            "model via its full package name: nlp = spacy.load('{name}')")
-    M003 = ("Server error ({code})")
-    M004 = ("Couldn't fetch {desc}. Please find a model for your spaCy "
-            "installation (v{version}), and download it manually. For more "
-            "details, see the documentation: https://spacy.io/usage/models")
-    M005 = ("Compatibility error")
-    M006 = ("No compatible models found for v{version} of spaCy.")
-    M007 = ("No compatible model found for '{name}' (spaCy v{version}).")
-    M008 = ("Can't locate model data")
-    M009 = ("The data should be located in {path}")
-    M010 = ("Can't find the spaCy data path to create model symlink")
-    M011 = ("Make sure a directory `/data` exists within your spaCy "
-            "installation and try again. The data directory should be "
-            "located here:")
-    M012 = ("Link '{name}' already exists")
-    M013 = ("To overwrite an existing link, use the --force flag.")
-    M014 = ("Can't overwrite symlink '{name}'")
-    M015 = ("This can happen if your data directory contains a directory or "
-            "file of the same name.")
-    M016 = ("Error: Couldn't link model to '{name}'")
-    M017 = ("Creating a symlink in spacy/data failed. Make sure you have the "
-            "required permissions and try re-running the command as admin, or "
-            "use a virtualenv. You can still import the model as a module and "
-            "call its load() method, or create the symlink manually.")
-    M018 = ("Linking successful")
-    M019 = ("You can now load the model via spacy.load('{name}')")
-    M020 = ("Can't find model meta.json")
-    M021 = ("Couldn't fetch compatibility table.")
-    M022 = ("Can't find spaCy v{version} in compatibility table")
-    M023 = ("Installed models (spaCy v{version})")
-    M024 = ("No models found in your current environment.")
-    M025 = ("Use the following commands to update the model packages:")
-    M026 = ("The following models are not available for spaCy "
-            "v{version}: {models}")
-    M027 = ("You may also want to overwrite the incompatible links using the "
-            "`python -m spacy link` command with `--force`, or remove them "
-            "from the data directory. Data path: {path}")
-    M028 = ("Input file not found")
-    M029 = ("Output directory not found")
-    M030 = ("Unknown format")
-    M031 = ("Can't find converter for {converter}")
-    M032 = ("Generated output file {name}")
-    M033 = ("Created {n_docs} documents")
-    M034 = ("Evaluation data not found")
-    M035 = ("Visualization output directory not found")
-    M036 = ("Generated {n} parses as HTML")
-    M037 = ("Can't find words frequencies file")
-    M038 = ("Sucessfully compiled vocab")
-    M039 = ("{entries} entries, {vectors} vectors")
-    M040 = ("Output directory not found")
-    M041 = ("Loaded meta.json from file")
-    M042 = ("Successfully created package '{name}'")
-    M043 = ("To build the package, run `python setup.py sdist` in this "
-            "directory.")
-    M044 = ("Package directory already exists")
-    M045 = ("Please delete the directory and try again, or use the `--force` "
-            "flag to overwrite existing directories.")
-    M046 = ("Generating meta.json")
-    M047 = ("Enter the package settings for your model. The following "
-            "information will be read from your model data: pipeline, vectors.")
-    M048 = ("No '{key}' setting found in meta.json")
-    M049 = ("This setting is required to build your package.")
-    M050 = ("Training data not found")
-    M051 = ("Development data not found")
-    M052 = ("Not a valid meta.json format")
-    M053 = ("Expected dict but got: {meta_type}")
-    M054 = ("No --lang specified, but tokenization required.")
-    M055 = ("Training pipeline: {pipeline}")
-    M056 = ("Starting with base model '{model}'")
-    M057 = ("Starting with blank model '{model}'")
-    M058 = ("Loading vector from model '{model}'")
-    M059 = ("Can't use multitask objective without '{pipe}' in the pipeline")
-    M060 = ("Counting training words (limit={limit})")
-    M061 = ("\nSaving model...")
-    M062 = ("Output directory is not empty.")
-    M063 = ("Incompatible arguments")
-    M064 = ("The -f and -c arguments are deprecated, and not compatible with "
-            "the -j argument, which should specify the same information. "
-            "Either merge the frequencies and clusters data into the "
-            "JSONL-formatted file (recommended), or use only the -f and -c "
-            "files, without the other lexical attributes.")
-    M065 = ("This can lead to unintended side effects when saving the model. "
-            "Please use an empty directory or a different path instead. If "
-            "the specified output path doesn't exist, the directory will be "
-            "created for you.")
-    M066 = ("Saved model to output directory")
-    M067 = ("Can't find lexical data")
-    M068 = ("Sucessfully compiled vocab and vectors, and saved model")
-    M069 = ("Unknown file type: '{name}'")
-    M070 = ("Supported file types: '{options}'")
-    M071 = ("Loaded pretrained tok2vec for: {components}")
-    M072 = ("Model language ('{model_lang}') doesn't match language specified "
-            "as `lang` argument ('{lang}') ")
-
-# fmt: on

diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index a2c1d20e0..ceccb8c8a 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -6,10 +6,8 @@ from pathlib import Path
 from wasabi import Printer
 import srsly
 
-from ..compat import path2str
 from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
 from .converters import ner_jsonl2json
-from ._messages import Messages
 
 
 # Converters are matched by file extension. To add a converter, add a new
@@ -56,18 +54,18 @@ def convert(
     input_path = Path(input_file)
     if file_type not in FILE_TYPES:
         msg.fail(
-            Messages.M069.format(name=file_type),
-            Messages.M070.format(options=", ".join(FILE_TYPES)),
+            "Unknown file type: '{}'".format(file_type),
+            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
             exits=1,
         )
     if not input_path.exists():
-        msg.fail(Messages.M028, input_path, exits=1)
+        msg.fail("Input file not found", input_path, exits=1)
     if output_dir != "-" and not Path(output_dir).exists():
-        msg.fail(Messages.M029, output_dir, exits=1)
+        msg.fail("Output directory not found", output_dir, exits=1)
     if converter == "auto":
         converter = input_path.suffix[1:]
     if converter not in CONVERTERS:
-        msg.fail(Messages.M030, Messages.M031.format(converter=converter), exits=1)
+        msg.fail("Can't find converter for {}".format(converter), exits=1)
     # Use converter function to convert data
     func = CONVERTERS[converter]
     input_data = input_path.open("r", encoding="utf-8").read()
@@ -80,10 +78,7 @@ def convert(
             srsly.write_json(output_file, data)
         elif file_type == "jsonl":
             srsly.write_jsonl(output_file, data)
-        msg.good(
-            Messages.M032.format(name=path2str(output_file)),
-            Messages.M033.format(n_docs=len(data)),
-        )
+        msg.good("Generated output file ({} documents)".format(len(data)), output_file)
     else:
         # Print to stdout
         if file_type == "json":

diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py
index a281db86d..796208c1a 100644
--- a/spacy/cli/converters/jsonl2json.py
+++ b/spacy/cli/converters/jsonl2json.py
@@ -4,12 +4,11 @@ from __future__ import unicode_literals
 import srsly
 
 from ...util import get_lang_class
-from .._messages import Messages
 
 
 def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
     if lang is None:
-        raise ValueError(Messages.M054)
+        raise ValueError("No --lang specified, but tokenization required")
     json_docs = []
     input_tuples = [srsly.json_loads(line) for line in input_data]
     nlp = get_lang_class(lang)()

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 06f648124..70acb47fa 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -12,7 +12,6 @@ from ..gold import GoldCorpus, read_json_object
 from ..util import load_model, get_lang_class
 
 # from .schemas import get_schema, validate_json
-from ._messages import Messages
 
 
 # Minimum number of expected occurences of label in data to train new label
@@ -58,9 +57,9 @@ def debug_data(
 
     # Make sure all files and paths exists if they are needed
     if not train_path.exists():
-        msg.fail(Messages.M050, train_path, exits=1)
+        msg.fail("Training data not found", train_path, exits=1)
     if not dev_path.exists():
-        msg.fail(Messages.M051, dev_path, exits=1)
+        msg.fail("Development data not found", dev_path, exits=1)
 
     # Initialize the model and pipeline
     pipeline = [p.strip() for p in pipeline.split(",")]
@@ -72,10 +71,8 @@ def debug_data(
     msg.divider("Data format validation")
 
     # Load the data in one – might take a while but okay in this case
-    with msg.loading("Loading {}...".format(train_path.parts[-1])):
-        train_data = _load_file(train_path, msg)
-    with msg.loading("Loading {}...".format(dev_path.parts[-1])):
-        dev_data = _load_file(dev_path, msg)
+    train_data = _load_file(train_path, msg)
+    dev_data = _load_file(dev_path, msg)
 
     # Validate data format using the JSON schema
     # TODO: update once the new format is ready
@@ -172,6 +169,7 @@ def debug_data(
         existing_labels = [l for l in labels if l in model_labels]
 
         has_low_data_warning = False
        has_no_neg_warning = False
+        has_ws_ents_error = False
 
         msg.divider("Named Entity Recognition")
         msg.info(
@@ -201,6 +199,10 @@ def debug_data(
             "Existing: {}".format(_format_labels(existing_labels)), show=verbose
         )
 
+        if gold_data["ws_ents"]:
+            msg.fail("{} invalid whitespace entity spans".format(gold_data["ws_ents"]))
+            has_ws_ents_error = True
+
         for label in new_labels:
             if label_counts[label] <= NEW_LABEL_THRESHOLD:
                 msg.warn(
@@ -222,6 +224,8 @@ def debug_data(
             msg.good("Good amount of examples for all labels")
         if not has_no_neg_warning:
             msg.good("Examples without occurences available for all labels")
+        if not has_ws_ents_error:
+            msg.good("No entities consisting of or starting/ending with whitespace")
 
         if has_low_data_warning:
             msg.text(
@@ -236,6 +240,11 @@ def debug_data(
                 "type.",
                 show=verbose,
             )
+        if has_ws_ents_error:
+            msg.text(
+                "As of spaCy v2.1.0, entity spans consisting of or starting/ending "
+                "with whitespace characters are considered invalid."
+            )
 
     if "textcat" in pipeline:
         msg.divider("Text Classification")
@@ -321,11 +330,13 @@ def debug_data(
 def _load_file(file_path, msg):
     file_name = file_path.parts[-1]
     if file_path.suffix == ".json":
-        data = srsly.read_json(file_path)
+        with msg.loading("Loading {}...".format(file_name)):
+            data = srsly.read_json(file_path)
         msg.good("Loaded {}".format(file_name))
         return data
     elif file_path.suffix == ".jsonl":
-        data = srsly.read_jsonl(file_path)
+        with msg.loading("Loading {}...".format(file_name)):
+            data = srsly.read_jsonl(file_path)
         msg.good("Loaded {}".format(file_name))
         return data
     msg.fail(
@@ -342,6 +353,7 @@ def _compile_gold(train_docs, pipeline):
         "tags": Counter(),
         "deps": Counter(),
         "words": Counter(),
+        "ws_ents": 0,
         "n_words": 0,
         "texts": set(),
     }
@@ -350,7 +362,10 @@ def _compile_gold(train_docs, pipeline):
         data["n_words"] += len(gold.words)
         data["texts"].add(doc.text)
         if "ner" in pipeline:
-            for label in gold.ner:
+            for i, label in enumerate(gold.ner):
+                if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
+                    # "Illegal" whitespace entity
+                    data["ws_ents"] += 1
                 if label.startswith(("B-", "U-")):
                     combined_label = label.split("-")[1]
                     data["ner"][combined_label] += 1
@@ -371,18 +386,6 @@ def _format_labels(labels, counts=False):
     return ", ".join(["'{}'".format(l) for l in labels])
 
 
-def _get_ner_counts(data):
-    counter = Counter()
-    for doc, gold in data:
-        for label in gold.ner:
-            if label.startswith(("B-", "U-")):
-                combined_label = label.split("-")[1]
-                counter[combined_label] += 1
-            elif label == "-":
-                counter["-"] += 1
-    return counter
-
-
 def _get_examples_without_label(data, label):
     count = 0
     for doc, gold in data:

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index bc725dd16..922d1c6b3 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -8,7 +8,6 @@ import subprocess
 import sys
 from wasabi import Printer
 
-from ._messages import Messages
 from .link import link
 from ..util import get_package_path
 from .. import about
@@ -50,15 +49,24 @@ def download(model, direct=False, *pip_args):
             # Dirty, but since spacy.download and the auto-linking is
             # mostly a convenience wrapper, it's best to show a success
             # message and loading instructions, even if linking fails.
-            msg.warn(Messages.M002.format(name=model_name), Messages.M001)
+            msg.warn(
+                "Download successful but linking failed",
+                "Creating a shortcut link for 'en' didn't work (maybe you "
+                "don't have admin permissions?), but you can still load the "
+                "model via its full package name: "
+                "nlp = spacy.load('{}')".format(model_name),
+            )
 
 
 def get_json(url, desc):
     r = requests.get(url)
     if r.status_code != 200:
         msg.fail(
-            Messages.M003.format(code=r.status_code),
-            Messages.M004.format(desc=desc, version=about.__version__),
+            "Server error ({})".format(r.status_code),
+            "Couldn't fetch {}. Please find a model for your spaCy "
+            "installation (v{}), and download it manually. For more "
+            "details, see the documentation: "
+            "https://spacy.io/usage/models".format(desc, about.__version__),
             exits=1,
         )
     return r.json()
@@ -70,7 +78,7 @@ def get_compatibility():
     comp_table = get_json(about.__compatibility__, "compatibility table")
     comp = comp_table["spacy"]
     if version not in comp:
-        msg.fail(Messages.M005, Messages.M006.format(version=version), exits=1)
+        msg.fail("No compatible models found for v{} of spaCy".format(version), exits=1)
     return comp[version]
 
 
@@ -78,8 +86,8 @@ def get_version(model, comp):
     model = model.rsplit(".dev", 1)[0]
     if model not in comp:
         msg.fail(
-            Messages.M005,
-            Messages.M007.format(name=model, version=about.__version__),
+            "No compatible model found for '{}' "
+            "(spaCy v{}).".format(model, about.__version__),
             exits=1,
         )
     return comp[model][0]

diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 459c1419b..1ca0531fd 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -5,7 +5,6 @@ import plac
 from timeit import default_timer as timer
 from wasabi import Printer
 
-from ._messages import Messages
 from ..gold import GoldCorpus
 from .. import util
 from .. import displacy
@@ -39,9 +38,9 @@ def evaluate(
     data_path = util.ensure_path(data_path)
     displacy_path = util.ensure_path(displacy_path)
     if not data_path.exists():
-        msg.fail(Messages.M034, data_path, exits=1)
+        msg.fail("Evaluation data not found", data_path, exits=1)
     if displacy_path and not displacy_path.exists():
-        msg.fail(Messages.M035, displacy_path, exits=1)
+        msg.fail("Visualization output directory not found", displacy_path, exits=1)
     corpus = GoldCorpus(data_path, data_path)
     nlp = util.load_model(model)
     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
@@ -75,7 +74,7 @@ def evaluate(
             deps=render_deps,
             ents=render_ents,
         )
-        msg.good(Messages.M036.format(n=displacy_limit), displacy_path)
+        msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
 
 
 def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
@@ -90,39 +89,3 @@ def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=T
             docs[:limit], style="dep", page=True, options={"compact": True}
         )
         file_.write(html)
-
-
-def print_progress(itn, losses, dev_scores, wps=0.0):
-    scores = {}
-    for col in [
-        "dep_loss",
-        "tag_loss",
-        "uas",
-        "tags_acc",
-        "token_acc",
-        "ents_p",
-        "ents_r",
-        "ents_f",
-        "wps",
-    ]:
-        scores[col] = 0.0
-    scores["dep_loss"] = losses.get("parser", 0.0)
-    scores["ner_loss"] = losses.get("ner", 0.0)
-    scores["tag_loss"] = losses.get("tagger", 0.0)
-    scores.update(dev_scores)
-    scores["wps"] = wps
-    tpl = "\t".join(
-        (
-            "{:d}",
-            "{dep_loss:.3f}",
-            "{ner_loss:.3f}",
-            "{uas:.3f}",
-            "{ents_p:.3f}",
-            "{ents_r:.3f}",
-            "{ents_f:.3f}",
-            "{tags_acc:.3f}",
-            "{token_acc:.3f}",
-            "{wps:.1f}",
-        )
-    )
-    print(tpl.format(itn, **scores))

diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 7339faaab..3655327ef 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -7,7 +7,6 @@ from pathlib import Path
 from wasabi import Printer
 import srsly
 
-from ._messages import Messages
 from ..compat import path2str, basestring_, unicode_
 from .. import util
 from .. import about
@@ -32,7 +31,7 @@ def info(model=None, markdown=False, silent=False):
         model_path = util.get_data_path() / model
         meta_path = model_path / "meta.json"
         if not meta_path.is_file():
-            msg.fail(Messages.M020, meta_path, exits=1)
+            msg.fail("Can't find model meta.json", meta_path, exits=1)
         meta = srsly.read_json(meta_path)
         if model_path.resolve() != model_path:
             meta["link"] = path2str(model_path)

diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 0a8570a7b..08965d387 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -14,7 +14,6 @@ import zipfile
 import srsly
 from wasabi import Printer
 
-from ._messages import Messages
 from ..vectors import Vectors
 from ..errors import Errors, Warnings, user_warning
 from ..util import ensure_path, get_lang_class
@@ -58,14 +57,21 @@ def init_model(
             settings.append("-f")
         if clusters_loc:
             settings.append("-c")
-        msg.warn(Messages.M063, Messages.M064)
+        msg.warn(
+            "Incompatible arguments",
+            "The -f and -c arguments are deprecated, and not compatible "
+            "with the -j argument, which should specify the same "
+            "information. Either merge the frequencies and clusters data "
+            "into the JSONL-formatted file (recommended), or use only the "
+            "-f and -c files, without the other lexical attributes.",
+        )
         jsonl_loc = ensure_path(jsonl_loc)
         lex_attrs = srsly.read_jsonl(jsonl_loc)
     else:
         clusters_loc = ensure_path(clusters_loc)
         freqs_loc = ensure_path(freqs_loc)
         if freqs_loc is not None and not freqs_loc.exists():
-            msg.fail(Messages.M037, freqs_loc, exits=1)
+            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
         lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
 
     with msg.loading("Creating model..."):
@@ -75,7 +81,10 @@ def init_model(
         add_vectors(nlp, vectors_loc, prune_vectors)
     vec_added = len(nlp.vocab.vectors)
     lex_added = len(nlp.vocab)
-    msg.good(Messages.M038, Messages.M039.format(entries=lex_added, vectors=vec_added))
+    msg.good(
+        "Sucessfully compiled vocab",
+        "{} entries, {} vectors".format(lex_added, vec_added),
+    )
     if not output_dir.exists():
         output_dir.mkdir()
     nlp.to_disk(output_dir)

diff --git a/spacy/cli/link.py b/spacy/cli/link.py
index 6172dad07..6b719ffe6 100644
--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@@ -5,7 +5,6 @@ import plac
 from pathlib import Path
 from wasabi import Printer
 
-from ._messages import Messages
 from ..compat import symlink_to, path2str
 from .. import util
 
@@ -28,29 +27,52 @@ def link(origin, link_name, force=False, model_path=None):
         model_path = Path(origin) if model_path is None else Path(model_path)
     if not model_path.exists():
         msg.fail(
-            Messages.M008, Messages.M009.format(path=path2str(model_path)), exits=1
+            "Can't locate model data",
+            "The data should be located in {}".format(path2str(model_path)),
+            exits=1,
         )
     data_path = util.get_data_path()
     if not data_path or not data_path.exists():
         spacy_loc = Path(__file__).parent.parent
-        msg.fail(Messages.M010, Messages.M011.format(path=spacy_loc), exits=1)
+        msg.fail(
+            "Can't find the spaCy data path to create model symlink",
+            "Make sure a directory `/data` exists within your spaCy "
+            "installation and try again. The data directory should be located "
+            "here:".format(path=spacy_loc),
+            exits=1,
+        )
     link_path = util.get_data_path() / link_name
     if link_path.is_symlink() and not force:
-        msg.fail(Messages.M012.format(name=link_name), Messages.M013, exits=1)
+        msg.fail(
+            "Link '{}' already exists".format(link_name),
+            "To overwrite an existing link, use the --force flag",
+            exits=1,
+        )
     elif link_path.is_symlink():  # does a symlink exist?
         # NB: It's important to check for is_symlink here and not for exists,
         # because invalid/outdated symlinks would return False otherwise.
         link_path.unlink()
     elif link_path.exists():  # does it exist otherwise?
         # NB: Check this last because valid symlinks also "exist".
-        msg.fail(Messages.M014.format(name=link_name), Messages.M015, exits=1)
+        msg.fail(
+            "Can't overwrite symlink '{}'".format(link_name),
+            "This can happen if your data directory contains a directory or "
+            "file of the same name.",
+            exits=1,
+        )
     details = "%s --> %s" % (path2str(model_path), path2str(link_path))
     try:
         symlink_to(link_path, model_path)
     except:  # noqa: E722
         # This is quite dirty, but just making sure other errors are caught.
-        msg.fail(Messages.M016.format(name=link_name), Messages.M017)
+        msg.fail(
+            "Couldn't link model to '{}'".format(link_name),
+            "Creating a symlink in spacy/data failed. Make sure you have the "
+            "required permissions and try re-running the command as admin, or "
+            "use a virtualenv. You can still import the model as a module and "
+            "call its load() method, or create the symlink manually.",
+        )
         msg.text(details)
         raise
-    msg.good(Messages.M018, details)
-    msg.text(Messages.M019.format(name=link_name))
+    msg.good("Linking successful", details)
+    msg.text("You can now load the model via spacy.load('{}')".format(link_name))

diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 916dbc1f2..88b93c9af 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -7,7 +7,6 @@ from pathlib import Path
 from wasabi import Printer, get_raw_input
 import srsly
 
-from ._messages import Messages
 from ..compat import path2str
 from .. import util
 from .. import about
@@ -33,22 +32,26 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
     output_path = util.ensure_path(output_dir)
     meta_path = util.ensure_path(meta_path)
     if not input_path or not input_path.exists():
-        msg.fail(Messages.M008, input_path, exits=1)
+        msg.fail("Can't locate model data", input_path, exits=1)
     if not output_path or not output_path.exists():
-        msg.fail(Messages.M040, output_path, exits=1)
+        msg.fail("Output directory not found", output_path, exits=1)
     if meta_path and not meta_path.exists():
-        msg.fail(Messages.M020, meta_path, exits=1)
+        msg.fail("Can't find model meta.json", meta_path, exits=1)
 
     meta_path = meta_path or input_path / "meta.json"
     if meta_path.is_file():
         meta = srsly.read_json(meta_path)
         if not create_meta:  # only print if user doesn't want to overwrite
-            msg.good(Messages.M041, meta_path)
+            msg.good("Loaded meta.json from file", meta_path)
         else:
             meta = generate_meta(input_dir, meta, msg)
     for key in ("lang", "name", "version"):
         if key not in meta or meta[key] == "":
-            msg.fail(Messages.M048.format(key=key), Messages.M049, exits=1)
+            msg.fail(
+                "No '{}' setting found in meta.json".format(key),
+                "This setting is required to build your package.",
+                exits=1,
+            )
     model_name = meta["lang"] + "_" + meta["name"]
     model_name_v = model_name + "-" + meta["version"]
     main_path = output_path / model_name_v
@@ -59,8 +62,10 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
             shutil.rmtree(path2str(package_path))
         else:
             msg.fail(
-                Messages.M044,
-                Messages.M045.format(path=path2str(package_path)),
+                "Package directory already exists",
+                "Please delete the directory and try again, or use the "
+                "`--force` flag to overwrite existing "
+                "directories.".format(path=path2str(package_path)),
                 exits=1,
             )
     Path.mkdir(package_path, parents=True)
@@ -69,8 +74,8 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
     create_file(main_path / "setup.py", TEMPLATE_SETUP)
     create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
     create_file(package_path / "__init__.py", TEMPLATE_INIT)
-    msg.good(Messages.M042.format(name=model_name_v), main_path)
-    msg.text(Messages.M043)
+    msg.good("Successfully created package '{}'".format(model_name_v), main_path)
+    msg.text("To build the package, run `python setup.py sdist` in this directory.")
 
 
 def create_file(file_path, contents):
@@ -98,8 +103,11 @@ def generate_meta(model_path, existing_meta, msg):
         "vectors": len(nlp.vocab.vectors),
         "keys": nlp.vocab.vectors.n_keys,
     }
-    msg.divider(Messages.M046)
-    msg.text(Messages.M047)
+    msg.divider("Generating meta.json")
+    msg.text(
+        "Enter the package settings for your model. The following information "
+        "will be read from your model data: pipeline, vectors."
+    )
     for setting, desc, default in settings:
         response = get_raw_input(desc, default)
         meta[setting] = default if response == "" and default else response

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 90decdc12..70a288a97 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -10,7 +10,6 @@ import shutil
 import srsly
 from wasabi import Printer
 
-from ._messages import Messages
 from .._ml import create_default_optimizer
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus
@@ -91,16 +90,20 @@ def train(
     dev_path = util.ensure_path(dev_path)
     meta_path = util.ensure_path(meta_path)
     if not train_path or not train_path.exists():
-        msg.fail(Messages.M050, train_path, exits=1)
+        msg.fail("Training data not found", train_path, exits=1)
     if not dev_path or not dev_path.exists():
-        msg.fail(Messages.M051, dev_path, exits=1)
+        msg.fail("Development data not found", dev_path, exits=1)
     if meta_path is not None and not meta_path.exists():
-        msg.fail(Messages.M020, meta_path, exits=1)
+        msg.fail("Can't find model meta.json", meta_path, exits=1)
     meta = srsly.read_json(meta_path) if meta_path else {}
-    if not isinstance(meta, dict):
-        msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1)
     if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
-        msg.fail(Messages.M062, Messages.M065)
+        msg.warn(
+            "Output directory is not empty",
+            "This can lead to unintended side effects when saving the model. "
+            "Please use an empty directory or a different path instead. If "
+            "the specified output path doesn't exist, the directory will be "
+            "created for you.",
+        )
     if not output_path.exists():
         output_path.mkdir()
 
@@ -123,19 +126,23 @@ def train(
     # the model and make sure the pipeline matches the pipeline setting. If
     # training starts from a blank model, intitalize the language class.
     pipeline = [p.strip() for p in pipeline.split(",")]
-    msg.text(Messages.M055.format(pipeline=pipeline))
+    msg.text("Training pipeline: {}".format(pipeline))
     if base_model:
-        msg.text(Messages.M056.format(model=base_model))
+        msg.text("Starting with base model '{}'".format(base_model))
         nlp = util.load_model(base_model)
         if nlp.lang != lang:
-            msg.fail(Messages.M072.format(model_lang=nlp.lang, lang=lang), exits=1)
+            msg.fail(
+                "Model language ('{}') doesn't match language specified as "
+                "`lang` argument ('{}') ".format(nlp.lang, lang),
+                exits=1,
+            )
         other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
         nlp.disable_pipes(*other_pipes)
         for pipe in pipeline:
             if pipe not in nlp.pipe_names:
                 nlp.add_pipe(nlp.create_pipe(pipe))
     else:
-        msg.text(Messages.M057.format(model=lang))
+        msg.text("Starting with blank model '{}'".format(lang))
         lang_cls = util.get_lang_class(lang)
         nlp = lang_cls()
     for pipe in pipeline:
@@ -145,7 +152,7 @@ def train(
         nlp.add_pipe(nlp.create_pipe("merge_subtokens"))
 
     if vectors:
-        msg.text(Messages.M058.format(model=vectors))
+        msg.text("Loading vector from model '{}'".format(vectors))
         _load_vectors(nlp, vectors)
 
     # Multitask objectives
@@ -153,13 +160,16 @@ def train(
     for pipe_name, multitasks in multitask_options:
         if multitasks:
             if pipe_name not in pipeline:
-                msg.fail(Messages.M059.format(pipe=pipe_name))
+                msg.fail(
+                    "Can't use multitask objective without '{}' in the "
+                    "pipeline".format(pipe_name)
+                )
             pipe = nlp.get_pipe(pipe_name)
             for objective in multitasks.split(","):
                 pipe.add_multitask_objective(objective)
 
     # Prepare training corpus
-    msg.text(Messages.M060.format(limit=n_examples))
+    msg.text("Counting training words (limit={})".format(n_examples))
     corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
     n_train_words = corpus.count_train()
 
@@ -175,11 +185,19 @@ def train(
     # Load in pre-trained weights
     if init_tok2vec is not None:
         components = _load_pretrained_tok2vec(nlp, init_tok2vec)
-        msg.text(Messages.M071.format(components=components))
+        msg.text("Loaded pretrained tok2vec for: {}".format(components))
 
-    print(
-        "\nItn.  Dep Loss  NER Loss  UAS  NER P.  NER R.  NER F.  Tag %  Token %  CPU WPS  GPU WPS"
-    )
+    # fmt: off
+    row_head = ("Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS")
+    row_settings = {
+        "widths": (3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7),
+        "aligns": ["r" for i in row_head],
+        "spacing": 2
+    }
+    # fmt: on
+    print("")
+    msg.row(row_head, **row_settings)
+    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
     try:
         for i in range(n_iter):
             train_docs = corpus.train_docs(
@@ -246,15 +264,18 @@ def train(
 
             util.set_env_log(verbose)
 
-            print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
+            progress = _get_progress(
+                i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps
+            )
+            msg.row(progress, **row_settings)
     finally:
-        with msg.loading(Messages.M061):
-            with nlp.use_params(optimizer.averages):
-                final_model_path = output_path / "model-final"
-                nlp.to_disk(final_model_path)
-        msg.good(Messages.M066, util.path2str(final_model_path))
-
-        _collate_best_model(meta, output_path, nlp.pipe_names)
+        with nlp.use_params(optimizer.averages):
+            final_model_path = output_path / "model-final"
+            nlp.to_disk(final_model_path)
+        msg.good("Saved model to output directory", final_model_path)
+        with msg.loading("Creating best model..."):
+            best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
+        msg.good("Created best model", best_model_path)
 
 
 def _load_vectors(nlp, vectors):
@@ -297,6 +318,7 @@ def _collate_best_model(meta, output_path, components):
         for metric in _get_metrics(component):
             meta["accuracy"][metric] = accs[metric]
     srsly.write_json(best_dest / "meta.json", meta)
+    return best_dest
 
 
 def _find_best(experiment_dir, component):
@@ -322,7 +344,7 @@ def _get_metrics(component):
     return ("token_acc",)
 
 
-def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
+def _get_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
     scores = {}
     for col in [
         "dep_loss",
@@ -343,19 +365,16 @@ def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
     scores.update(dev_scores)
     scores["cpu_wps"] = cpu_wps
     scores["gpu_wps"] = gpu_wps or 0.0
-    tpl = "".join(
-        (
-            "{:<6d}",
-            "{dep_loss:<10.3f}",
-            "{ner_loss:<10.3f}",
-            "{uas:<8.3f}",
-            "{ents_p:<8.3f}",
-            "{ents_r:<8.3f}",
-            "{ents_f:<8.3f}",
-            "{tags_acc:<8.3f}",
-            "{token_acc:<9.3f}",
-            "{cpu_wps:<9.1f}",
-            "{gpu_wps:.1f}",
-        )
-    )
-    print(tpl.format(itn, **scores))
+    return [
+        itn,
+        "{:.3f}".format(scores["dep_loss"]),
+        "{:.3f}".format(scores["ner_loss"]),
+        "{:.3f}".format(scores["uas"]),
+        "{:.3f}".format(scores["ents_p"]),
+        "{:.3f}".format(scores["ents_r"]),
+        "{:.3f}".format(scores["ents_f"]),
+        "{:.3f}".format(scores["tags_acc"]),
+        "{:.3f}".format(scores["token_acc"]),
+        "{:.0f}".format(scores["cpu_wps"]),
+        "{:.0f}".format(scores["gpu_wps"]),
+    ]

diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index 4b5581972..20915caed 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -8,7 +8,6 @@ import requests
 import srsly
 from wasabi import Printer
 
-from ._messages import Messages
 from ..compat import path2str
 from ..util import get_data_path
 from .. import about
@@ -23,13 +22,17 @@ def validate():
     with msg.loading("Loading compatibility table..."):
         r = requests.get(about.__compatibility__)
     if r.status_code != 200:
-        msg.fail(Messages.M003.format(code=r.status_code), Messages.M021, exits=1)
+        msg.fail(
+            "Server error ({})".format(r.status_code),
+            "Couldn't fetch compatibility table.",
+            exits=1,
+        )
     msg.good("Loaded compatibility table")
     compat = r.json()["spacy"]
     current_compat = compat.get(about.__version__)
     if not current_compat:
         msg.fail(
-            Messages.M022.format(version=about.__version__),
+            "Can't find spaCy v{} in compatibility table".format(about.__version__),
             about.__compatibility__,
             exits=1,
         )
@@ -49,7 +52,7 @@ def validate():
     update_models = [m for m in incompat_models if m in current_compat]
     spacy_dir = Path(__file__).parent.parent
 
-    msg.divider(Messages.M023.format(version=about.__version__))
+    msg.divider("Installed models (spaCy v{})".format(about.__version__))
     msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
 
     if model_links or model_pkgs:
@@ -61,17 +64,24 @@ def validate():
             rows.append(get_model_row(current_compat, name, data, msg, "link"))
         msg.table(rows, header=header)
     else:
-        msg.text(Messages.M024, exits=0)
+        msg.text("No models found in your current environment.", exits=0)
     if update_models:
         msg.divider("Install updates")
+        msg.text("Use the following commands to update the model packages:")
         cmd = "python -m spacy download {}"
         print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
     if na_models:
         msg.text(
-            Messages.M026.format(version=about.__version__, models=", ".join(na_models))
+            "The following models are not available for spaCy "
+            "v{}: {}".format(about.__version__, ", ".join(na_models))
         )
     if incompat_links:
-        msg.text(Messages.M027.format(path=path2str(get_data_path())))
+        msg.text(
+            "You may also want to overwrite the incompatible links using the "
+            "`python -m spacy link` command with `--force`, or remove them "
+            "from the data directory. "
+            "Data path: {path}".format(path=path2str(get_data_path()))
+        )
     if incompat_models or incompat_links:
         sys.exit(1)

From 11a29af751676297824b39b54b7ffd44d1e102b6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 8 Dec 2018 12:37:38 +0100
Subject: [PATCH 07/11] Set cupy.random seed in fix_random_seed helper

---
 spacy/util.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/spacy/util.py b/spacy/util.py
index 0a682fcaa..ea662d3a3 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -15,6 +15,11 @@ import itertools
 import numpy.random
 import srsly
 
+try:
+    import cupy.random
+except ImportError:
+    cupy = None
+
 from .symbols import ORTH
 from .compat import cupy, CudaStream, path2str, basestring_, unicode_
 from .compat import import_file
@@ -598,6 +603,8 @@ def use_gpu(gpu_id):
 def fix_random_seed(seed=0):
     random.seed(seed)
     numpy.random.seed(seed)
+    if cupy is not None:
+        cupy.random.seed(seed)
 
 
 class SimpleFrozenDict(dict):

From 2c2db0c492213e0dec3e5c4493b60e686c8c0096 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 8 Dec 2018 13:08:41 +0100
Subject: [PATCH 08/11] =?UTF-8?q?=F0=9F=92=AB=20Allow=20Span=20to=20take?=
 =?UTF-8?q?=20text=20label=20(#3031)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #3027.

* Allow Span.__init__ to take unicode values for the `label` argument.

* Allow `Span.label_` to be writeable.

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
---
 spacy/tests/doc/test_span.py | 13 ++++++++++++-
 spacy/tokens/span.pyx        |  8 ++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 6fd22b3ff..11fc0f228 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 
 import pytest
 from spacy.attrs import ORTH, LENGTH
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
 
 from ..util import get_doc
@@ -154,6 +154,17 @@ def test_span_as_doc(doc):
     assert span.text == span_doc.text.strip()
 
 
+def test_span_string_label(doc):
+    span = Span(doc, 0, 1, label='hello')
+    assert span.label_ == 'hello'
+    assert span.label == doc.vocab.strings['hello']
+
+def test_span_string_set_label(doc):
+    span = Span(doc, 0, 1)
+    span.label_ = 'hello'
+    assert span.label_ == 'hello'
+    assert span.label == doc.vocab.strings['hello']
+
 def test_span_ents_property(doc):
     """Test span.ents for the """
     doc.ents = [

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 6cba46a22..29082b894 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -15,7 +15,7 @@ from ..parts_of_speech cimport univ_pos_t
 from ..util import normalize_slice
 from ..attrs cimport IS_PUNCT, IS_SPACE
 from ..lexeme cimport Lexeme
-from ..compat import is_config
+from ..compat import is_config, basestring_
 from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
 from .underscore import Underscore, get_ext_args
 
@@ -42,7 +42,7 @@ cdef class Span:
             raise ValueError(Errors.E046.format(name=name))
         return Underscore.span_extensions.pop(name)
 
-    def __cinit__(self, Doc doc, int start, int end, attr_t label=0,
+    def __cinit__(self, Doc doc, int start, int end, label=0,
                   vector=None, vector_norm=None):
         """Create a `Span` object from the slice `doc[start : end]`.
 
@@ -64,6 +64,8 @@ cdef class Span:
             self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1])
         else:
             self.end_char = 0
+        if isinstance(label, basestring_):
+            label = doc.vocab.strings.add(label)
         if label not in doc.vocab.strings:
             raise ValueError(Errors.E084.format(label=label))
         self.label = label
@@ -601,6 +603,8 @@ cdef class Span:
         """RETURNS (unicode): The span's label."""
         def __get__(self):
             return self.doc.vocab.strings[self.label]
+        def __set__(self, unicode label_):
+            self.label = self.doc.vocab.strings.add(label_)
 
 
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:

From cb16b78b0d3e7fc91eb0e7c8da725e4d5780aec5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 8 Dec 2018 19:59:11 +0100
Subject: [PATCH 09/11] Set dropout rate to 0.2

---
 spacy/cli/train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 70a288a97..66e5c14d5 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -112,8 +112,8 @@ def train(
     # Batch size starts at 1 and grows, so that we make updates quickly
     # at the beginning of training.
     dropout_rates = util.decaying(
-        util.env_opt("dropout_from", 0.1),
-        util.env_opt("dropout_to", 0.1),
+        util.env_opt("dropout_from", 0.2),
+        util.env_opt("dropout_to", 0.2),
         util.env_opt("dropout_decay", 0.0),
     )
     batch_sizes = util.compounding(

From d2ac618af103b9c4396d91501c858357f8f60ebe Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 8 Dec 2018 23:27:29 +0100
Subject: [PATCH 10/11] Set cbb_maxout_pieces=3

---
 spacy/_ml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index 3df9d72ba..c7c90398b 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -271,7 +271,7 @@ def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
 
 def Tok2Vec(width, embed_size, **kwargs):
     pretrained_vectors = kwargs.get("pretrained_vectors", None)
-    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 2)
+    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
     subword_features = kwargs.get("subword_features", True)
     conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)

From 1b1a1af19373a30a8b07f0b6eebe0217566d8ff8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 9 Dec 2018 06:03:49 +0100
Subject: [PATCH 11/11] Fix printing in spacy train

---
 spacy/cli/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 66e5c14d5..f29beacc9 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -191,7 +191,7 @@ def train(
     row_head = ("Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS")
     row_settings = {
         "widths": (3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7),
-        "aligns": ["r" for i in row_head],
+        "aligns": tuple(["r" for i in row_head]),
         "spacing": 2
     }
     # fmt: on
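
For reference, the progress table that patches 06 and 11 set up can be
reproduced with wasabi directly. This is a sketch, not part of the patch
series: it uses the widths/aligns/spacing settings from the hunk above
(trimmed to four columns), and the values in the last row are made up.

    from wasabi import Printer

    msg = Printer()
    row_head = ("Itn", "Dep Loss", "NER Loss", "UAS")
    row_settings = {
        "widths": (3, 10, 10, 7),
        "aligns": tuple("r" for _ in row_head),  # a tuple, per PATCH 11/11
        "spacing": 2,
    }
    msg.row(row_head, **row_settings)
    msg.row(["-" * w for w in row_settings["widths"]], **row_settings)
    msg.row(["0", "12.345", "6.789", "91.234"], **row_settings)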