Merge branch 'master' into spacy.io

This commit is contained in:
Ines Montani 2019-12-20 23:00:31 +01:00
commit 1bb11953e8
12 changed files with 14 additions and 43 deletions

View File

@ -8,6 +8,7 @@ import plac
from pathlib import Path from pathlib import Path
import re import re
import json import json
import tqdm
import spacy import spacy
import spacy.util import spacy.util
@ -486,9 +487,6 @@ def main(
vectors_dir=None, vectors_dir=None,
use_oracle_segments=False, use_oracle_segments=False,
): ):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
Token.set_extension("get_conllu_lines", method=get_token_conllu) Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False) Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False) Token.set_extension("inside_fused", default=False)

View File

@ -1,6 +1,7 @@
import logging import logging
import random import random
from tqdm import tqdm
from collections import defaultdict from collections import defaultdict
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -119,8 +120,6 @@ def get_eval_results(data, el_pipe=None):
Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL. Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.
If the docs in the data require further processing with an entity linker, set el_pipe. If the docs in the data require further processing with an entity linker, set el_pipe.
""" """
from tqdm import tqdm
docs = [] docs = []
golds = [] golds = []
for d, g in tqdm(data, leave=False): for d, g in tqdm(data, leave=False):

View File

@ -6,6 +6,7 @@ import bz2
import logging import logging
import random import random
import json import json
from tqdm import tqdm
from functools import partial from functools import partial
@ -457,9 +458,6 @@ def read_training(nlp, entity_file_path, dev, limit, kb, labels_discard=None):
""" This method provides training examples that correspond to the entity annotations found by the nlp object. """ This method provides training examples that correspond to the entity annotations found by the nlp object.
For training, it will include both positive and negative examples by using the candidate generator from the kb. For training, it will include both positive and negative examples by using the candidate generator from the kb.
For testing (kb=None), it will include all positive examples only.""" For testing (kb=None), it will include all positive examples only."""
from tqdm import tqdm
if not labels_discard: if not labels_discard:
labels_discard = [] labels_discard = []

View File

@ -7,6 +7,7 @@ import attr
from pathlib import Path from pathlib import Path
import re import re
import json import json
import tqdm
import spacy import spacy
import spacy.util import spacy.util
@ -386,9 +387,6 @@ class TreebankPaths(object):
limit=("Size limit", "option", "n", int), limit=("Size limit", "option", "n", int),
) )
def main(ud_dir, parses_dir, config, corpus, limit=0): def main(ud_dir, parses_dir, config, corpus, limit=0):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
Token.set_extension("get_conllu_lines", method=get_token_conllu) Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False) Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False) Token.set_extension("inside_fused", default=False)

View File

@ -14,6 +14,7 @@ pre-train with the development data, but also not *so* terrible: we're not using
the development labels, after all --- only the unlabelled text. the development labels, after all --- only the unlabelled text.
""" """
import plac import plac
import tqdm
import random import random
import spacy import spacy
import thinc.extra.datasets import thinc.extra.datasets
@ -106,9 +107,6 @@ def create_pipeline(width, embed_size, vectors_model):
def train_tensorizer(nlp, texts, dropout, n_iter): def train_tensorizer(nlp, texts, dropout, n_iter):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
tensorizer = nlp.create_pipe("tensorizer") tensorizer = nlp.create_pipe("tensorizer")
nlp.add_pipe(tensorizer) nlp.add_pipe(tensorizer)
optimizer = nlp.begin_training() optimizer = nlp.begin_training()
@ -122,9 +120,6 @@ def train_tensorizer(nlp, texts, dropout, n_iter):
def train_textcat(nlp, n_texts, n_iter=10): def train_textcat(nlp, n_texts, n_iter=10):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
textcat = nlp.get_pipe("textcat") textcat = nlp.get_pipe("textcat")
tok2vec_weights = textcat.model.tok2vec.to_bytes() tok2vec_weights = textcat.model.tok2vec.to_bytes()
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts) (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)

View File

@ -8,6 +8,7 @@ from __future__ import unicode_literals
from os import path from os import path
import tqdm
import math import math
import numpy import numpy
import plac import plac
@ -35,9 +36,6 @@ from tensorflow.contrib.tensorboard.plugins.projector import (
), ),
) )
def main(vectors_loc, out_loc, name="spaCy_vectors"): def main(vectors_loc, out_loc, name="spaCy_vectors"):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
meta_file = "{}.tsv".format(name) meta_file = "{}.tsv".format(name)
out_meta_file = path.join(out_loc, meta_file) out_meta_file = path.join(out_loc, meta_file)

View File

@ -12,6 +12,7 @@ numpy>=1.15.0
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0 plac>=0.9.6,<1.2.0
pathlib==1.0.1; python_version < "3.4" pathlib==1.0.1; python_version < "3.4"
tqdm>=4.38.0,<5.0.0
# Optional dependencies # Optional dependencies
jsonschema>=2.6.0,<3.1.0 jsonschema>=2.6.0,<3.1.0
# Development dependencies # Development dependencies

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
import plac import plac
import math import math
from tqdm import tqdm
import numpy import numpy
from ast import literal_eval from ast import literal_eval
from pathlib import Path from pathlib import Path
@ -116,9 +117,6 @@ def open_file(loc):
def read_attrs_from_deprecated(freqs_loc, clusters_loc): def read_attrs_from_deprecated(freqs_loc, clusters_loc):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
from tqdm import tqdm
if freqs_loc is not None: if freqs_loc is not None:
with msg.loading("Counting frequencies..."): with msg.loading("Counting frequencies..."):
probs, _ = read_freqs(freqs_loc) probs, _ = read_freqs(freqs_loc)
@ -201,9 +199,6 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
def read_vectors(vectors_loc): def read_vectors(vectors_loc):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
from tqdm import tqdm
f = open_file(vectors_loc) f = open_file(vectors_loc)
shape = tuple(int(size) for size in next(f).split()) shape = tuple(int(size) for size in next(f).split())
vectors_data = numpy.zeros(shape=shape, dtype="f") vectors_data = numpy.zeros(shape=shape, dtype="f")
@ -220,9 +215,6 @@ def read_vectors(vectors_loc):
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
from tqdm import tqdm
counts = PreshCounter() counts = PreshCounter()
total = 0 total = 0
with freqs_loc.open() as f: with freqs_loc.open() as f:
@ -252,9 +244,6 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
def read_clusters(clusters_loc): def read_clusters(clusters_loc):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
from tqdm import tqdm
clusters = {} clusters = {}
if ftfy is None: if ftfy is None:
user_warning(Warnings.W004) user_warning(Warnings.W004)

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals, division, print_function from __future__ import unicode_literals, division, print_function
import plac import plac
import tqdm
from pathlib import Path from pathlib import Path
import srsly import srsly
import cProfile import cProfile
@ -46,9 +47,6 @@ def profile(model, inputs=None, n_texts=10000):
def parse_texts(nlp, texts): def parse_texts(nlp, texts):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16): for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
pass pass

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals, division, print_function
import plac import plac
import os import os
import tqdm
from pathlib import Path from pathlib import Path
from thinc.neural._classes.model import Model from thinc.neural._classes.model import Model
from timeit import default_timer as timer from timeit import default_timer as timer
@ -85,10 +86,6 @@ def train(
JSON format. To convert data from other formats, use the `spacy convert` JSON format. To convert data from other formats, use the `spacy convert`
command. command.
""" """
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
util.fix_random_seed() util.fix_random_seed()
util.set_env_log(verbose) util.set_env_log(verbose)
@ -516,9 +513,6 @@ def _score_for_model(meta):
@contextlib.contextmanager @contextlib.contextmanager
def _create_progress_bar(total): def _create_progress_bar(total):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
if int(os.environ.get("LOG_FRIENDLY", 0)): if int(os.environ.get("LOG_FRIENDLY", 0)):
yield yield
else: else:

View File

@ -53,7 +53,9 @@ class Warnings(object):
W009 = ("Custom factory '{name}' provided by entry points of another " W009 = ("Custom factory '{name}' provided by entry points of another "
"package overwrites built-in factory.") "package overwrites built-in factory.")
W010 = ("As of v2.1.0, the PhraseMatcher doesn't have a phrase length " W010 = ("As of v2.1.0, the PhraseMatcher doesn't have a phrase length "
"limit anymore, so the max_length argument is now deprecated.") "limit anymore, so the max_length argument is now deprecated. "
"If you did not specify this parameter, make sure you call the "
"constructor with named arguments instead of positional ones.")
W011 = ("It looks like you're calling displacy.serve from within a " W011 = ("It looks like you're calling displacy.serve from within a "
"Jupyter notebook or a similar environment. This likely means " "Jupyter notebook or a similar environment. This likely means "
"you're already running a local web server, so there's no need to " "you're already running a local web server, so there's no need to "

View File

@ -38,6 +38,7 @@ be shown.
| Name | Type | Description | | Name | Type | Description |
| --------------------------------------- | --------------- | ------------------------------------------------------------------------------------------- | | --------------------------------------- | --------------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | | `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
| `max_length` | int | Deprecated argument - the `PhraseMatcher` does not have a phrase length limit anymore. |
| `attr` <Tag variant="new">2.1</Tag> | int / unicode | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. | | `attr` <Tag variant="new">2.1</Tag> | int / unicode | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. |
| `validate` <Tag variant="new">2.1</Tag> | bool | Validate patterns added to the matcher. | | `validate` <Tag variant="new">2.1</Tag> | bool | Validate patterns added to the matcher. |
| **RETURNS** | `PhraseMatcher` | The newly constructed object. | | **RETURNS** | `PhraseMatcher` | The newly constructed object. |