From 12158c1e3a96d4d17cf4ecfe36b899924200f3de Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 16 Dec 2019 13:12:19 +0100 Subject: [PATCH 1/2] Restore tqdm imports (#4804) * set 4.38.0 to minimal version with color bug fix * set imports back to proper place * add upper range for tqdm --- bin/ud/ud_train.py | 4 +--- bin/wiki_entity_linking/entity_linker_evaluation.py | 3 +-- bin/wiki_entity_linking/wikipedia_processor.py | 4 +--- examples/training/conllu.py | 4 +--- examples/training/pretrain_textcat.py | 7 +------ examples/vectors_tensorboard.py | 4 +--- requirements.txt | 1 + spacy/cli/init_model.py | 13 +------------ spacy/cli/profile.py | 4 +--- spacy/cli/train.py | 8 +------- 10 files changed, 10 insertions(+), 42 deletions(-) diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index ddd87a31c..6353bd6e7 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -8,6 +8,7 @@ import plac from pathlib import Path import re import json +import tqdm import spacy import spacy.util @@ -486,9 +487,6 @@ def main( vectors_dir=None, use_oracle_segments=False, ): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm - Token.set_extension("get_conllu_lines", method=get_token_conllu) Token.set_extension("begins_fused", default=False) Token.set_extension("inside_fused", default=False) diff --git a/bin/wiki_entity_linking/entity_linker_evaluation.py b/bin/wiki_entity_linking/entity_linker_evaluation.py index 94bafbf30..273ade0cd 100644 --- a/bin/wiki_entity_linking/entity_linker_evaluation.py +++ b/bin/wiki_entity_linking/entity_linker_evaluation.py @@ -1,6 +1,7 @@ import logging import random +from tqdm import tqdm from collections import defaultdict logger = logging.getLogger(__name__) @@ -119,8 +120,6 @@ def get_eval_results(data, el_pipe=None): Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL. If the docs in the data require further processing with an entity linker, set el_pipe. """ - from tqdm import tqdm - docs = [] golds = [] for d, g in tqdm(data, leave=False): diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py index 25e914b32..19df0cf10 100644 --- a/bin/wiki_entity_linking/wikipedia_processor.py +++ b/bin/wiki_entity_linking/wikipedia_processor.py @@ -6,6 +6,7 @@ import bz2 import logging import random import json +from tqdm import tqdm from functools import partial @@ -457,9 +458,6 @@ def read_training(nlp, entity_file_path, dev, limit, kb, labels_discard=None): """ This method provides training examples that correspond to the entity annotations found by the nlp object. For training, it will include both positive and negative examples by using the candidate generator from the kb. For testing (kb=None), it will include all positive examples only.""" - - from tqdm import tqdm - if not labels_discard: labels_discard = [] diff --git a/examples/training/conllu.py b/examples/training/conllu.py index d9ee721ec..1c65f4a72 100644 --- a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -7,6 +7,7 @@ import attr from pathlib import Path import re import json +import tqdm import spacy import spacy.util @@ -386,9 +387,6 @@ class TreebankPaths(object): limit=("Size limit", "option", "n", int), ) def main(ud_dir, parses_dir, config, corpus, limit=0): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm - Token.set_extension("get_conllu_lines", method=get_token_conllu) Token.set_extension("begins_fused", default=False) Token.set_extension("inside_fused", default=False) diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py index e45f3345e..00cbd992c 100644 --- a/examples/training/pretrain_textcat.py +++ b/examples/training/pretrain_textcat.py @@ -14,6 +14,7 @@ pre-train with the development data, but also not *so* terrible: we're not using the development labels, after all --- only the unlabelled text. """ import plac +import tqdm import random import spacy import thinc.extra.datasets @@ -106,9 +107,6 @@ def create_pipeline(width, embed_size, vectors_model): def train_tensorizer(nlp, texts, dropout, n_iter): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm - tensorizer = nlp.create_pipe("tensorizer") nlp.add_pipe(tensorizer) optimizer = nlp.begin_training() @@ -122,9 +120,6 @@ def train_tensorizer(nlp, texts, dropout, n_iter): def train_textcat(nlp, n_texts, n_iter=10): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm - textcat = nlp.get_pipe("textcat") tok2vec_weights = textcat.model.tok2vec.to_bytes() (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts) diff --git a/examples/vectors_tensorboard.py b/examples/vectors_tensorboard.py index b1160888d..72eda1edc 100644 --- a/examples/vectors_tensorboard.py +++ b/examples/vectors_tensorboard.py @@ -8,6 +8,7 @@ from __future__ import unicode_literals from os import path +import tqdm import math import numpy import plac @@ -35,9 +36,6 @@ from tensorflow.contrib.tensorboard.plugins.projector import ( ), ) def main(vectors_loc, out_loc, name="spaCy_vectors"): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm - meta_file = "{}.tsv".format(name) out_meta_file = path.join(out_loc, meta_file) diff --git a/requirements.txt b/requirements.txt index f208a2772..1786ee186 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,7 @@ numpy>=1.15.0 requests>=2.13.0,<3.0.0 plac>=0.9.6,<1.2.0 pathlib==1.0.1; python_version < "3.4" +tqdm>=4.38.0,<5.0.0 # Optional dependencies jsonschema>=2.6.0,<3.1.0 # Development dependencies diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index cda21cbcc..3fa0cc890 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import plac import math +from tqdm import tqdm import numpy from ast import literal_eval from pathlib import Path @@ -116,9 +117,6 @@ def open_file(loc): def read_attrs_from_deprecated(freqs_loc, clusters_loc): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - from tqdm import tqdm - if freqs_loc is not None: with msg.loading("Counting frequencies..."): probs, _ = read_freqs(freqs_loc) @@ -201,9 +199,6 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None): def read_vectors(vectors_loc): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - from tqdm import tqdm - f = open_file(vectors_loc) shape = tuple(int(size) for size in next(f).split()) vectors_data = numpy.zeros(shape=shape, dtype="f") @@ -220,9 +215,6 @@ def read_vectors(vectors_loc): def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - from tqdm import tqdm - counts = PreshCounter() total = 0 with freqs_loc.open() as f: @@ -252,9 +244,6 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): def read_clusters(clusters_loc): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - from tqdm import tqdm - clusters = {} if ftfy is None: user_warning(Warnings.W004) diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 4995224f3..4ee72fc23 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals, division, print_function import plac +import tqdm from pathlib import Path import srsly import cProfile @@ -46,9 +47,6 @@ def profile(model, inputs=None, n_texts=10000): def parse_texts(nlp, texts): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm - for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16): pass diff --git a/spacy/cli/train.py b/spacy/cli/train.py index d1fbdd179..7c4298ca3 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals, division, print_function import plac import os +import tqdm from pathlib import Path from thinc.neural._classes.model import Model from timeit import default_timer as timer @@ -85,10 +86,6 @@ def train( JSON format. To convert data from other formats, use the `spacy convert` command. """ - - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm - util.fix_random_seed() util.set_env_log(verbose) @@ -516,9 +513,6 @@ def _score_for_model(meta): @contextlib.contextmanager def _create_progress_bar(total): - # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 - import tqdm - if int(os.environ.get("LOG_FRIENDLY", 0)): yield else: From 8ebbb85117ddf7f600e4e40225629e910db4a4a6 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 20 Dec 2019 23:00:04 +0100 Subject: [PATCH 2/2] Documentation for PhraseMatcher constructor (#4826) * add max_length as argument for init PhraseMatcher * improve error message too --- spacy/errors.py | 4 +++- website/docs/api/phrasematcher.md | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index 4dcdcae1a..dd2b38eb9 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -53,7 +53,9 @@ class Warnings(object): W009 = ("Custom factory '{name}' provided by entry points of another " "package overwrites built-in factory.") W010 = ("As of v2.1.0, the PhraseMatcher doesn't have a phrase length " - "limit anymore, so the max_length argument is now deprecated.") + "limit anymore, so the max_length argument is now deprecated. " + "If you did not specify this parameter, make sure you call the " + "constructor with named arguments instead of positional ones.") W011 = ("It looks like you're calling displacy.serve from within a " "Jupyter notebook or a similar environment. This likely means " "you're already running a local web server, so there's no need to " diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index c7311a401..90ecd3416 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -38,6 +38,7 @@ be shown. | Name | Type | Description | | --------------------------------------- | --------------- | ------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | +| `max_length` | int | Deprecated argument - the `PhraseMatcher` does not have a phrase length limit anymore. | | `attr` 2.1 | int / unicode | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. | | `validate` 2.1 | bool | Validate patterns added to the matcher. | | **RETURNS** | `PhraseMatcher` | The newly constructed object. |