mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-03 22:06:37 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
1bb11953e8
|
@ -8,6 +8,7 @@ import plac
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
|
import tqdm
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
import spacy.util
|
import spacy.util
|
||||||
|
@ -486,9 +487,6 @@ def main(
|
||||||
vectors_dir=None,
|
vectors_dir=None,
|
||||||
use_oracle_segments=False,
|
use_oracle_segments=False,
|
||||||
):
|
):
|
||||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
|
||||||
import tqdm
|
|
||||||
|
|
||||||
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
||||||
Token.set_extension("begins_fused", default=False)
|
Token.set_extension("begins_fused", default=False)
|
||||||
Token.set_extension("inside_fused", default=False)
|
Token.set_extension("inside_fused", default=False)
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import logging
|
import logging
|
||||||
import random
|
import random
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
@ -119,8 +120,6 @@ def get_eval_results(data, el_pipe=None):
|
||||||
Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.
|
Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.
|
||||||
If the docs in the data require further processing with an entity linker, set el_pipe.
|
If the docs in the data require further processing with an entity linker, set el_pipe.
|
||||||
"""
|
"""
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
docs = []
|
docs = []
|
||||||
golds = []
|
golds = []
|
||||||
for d, g in tqdm(data, leave=False):
|
for d, g in tqdm(data, leave=False):
|
||||||
|
|
|
@ -6,6 +6,7 @@ import bz2
|
||||||
import logging
|
import logging
|
||||||
import random
|
import random
|
||||||
import json
|
import json
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
|
@ -457,9 +458,6 @@ def read_training(nlp, entity_file_path, dev, limit, kb, labels_discard=None):
|
||||||
""" This method provides training examples that correspond to the entity annotations found by the nlp object.
|
""" This method provides training examples that correspond to the entity annotations found by the nlp object.
|
||||||
For training, it will include both positive and negative examples by using the candidate generator from the kb.
|
For training, it will include both positive and negative examples by using the candidate generator from the kb.
|
||||||
For testing (kb=None), it will include all positive examples only."""
|
For testing (kb=None), it will include all positive examples only."""
|
||||||
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
if not labels_discard:
|
if not labels_discard:
|
||||||
labels_discard = []
|
labels_discard = []
|
||||||
|
|
||||||
|
|
|
@ -7,6 +7,7 @@ import attr
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
|
import tqdm
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
import spacy.util
|
import spacy.util
|
||||||
|
@ -386,9 +387,6 @@ class TreebankPaths(object):
|
||||||
limit=("Size limit", "option", "n", int),
|
limit=("Size limit", "option", "n", int),
|
||||||
)
|
)
|
||||||
def main(ud_dir, parses_dir, config, corpus, limit=0):
|
def main(ud_dir, parses_dir, config, corpus, limit=0):
|
||||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
|
||||||
import tqdm
|
|
||||||
|
|
||||||
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
||||||
Token.set_extension("begins_fused", default=False)
|
Token.set_extension("begins_fused", default=False)
|
||||||
Token.set_extension("inside_fused", default=False)
|
Token.set_extension("inside_fused", default=False)
|
||||||
|
|
|
@ -14,6 +14,7 @@ pre-train with the development data, but also not *so* terrible: we're not using
|
||||||
the development labels, after all --- only the unlabelled text.
|
the development labels, after all --- only the unlabelled text.
|
||||||
"""
|
"""
|
||||||
import plac
|
import plac
|
||||||
|
import tqdm
|
||||||
import random
|
import random
|
||||||
import spacy
|
import spacy
|
||||||
import thinc.extra.datasets
|
import thinc.extra.datasets
|
||||||
|
@ -106,9 +107,6 @@ def create_pipeline(width, embed_size, vectors_model):
|
||||||
|
|
||||||
|
|
||||||
def train_tensorizer(nlp, texts, dropout, n_iter):
|
def train_tensorizer(nlp, texts, dropout, n_iter):
|
||||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
|
||||||
import tqdm
|
|
||||||
|
|
||||||
tensorizer = nlp.create_pipe("tensorizer")
|
tensorizer = nlp.create_pipe("tensorizer")
|
||||||
nlp.add_pipe(tensorizer)
|
nlp.add_pipe(tensorizer)
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
|
@ -122,9 +120,6 @@ def train_tensorizer(nlp, texts, dropout, n_iter):
|
||||||
|
|
||||||
|
|
||||||
def train_textcat(nlp, n_texts, n_iter=10):
|
def train_textcat(nlp, n_texts, n_iter=10):
|
||||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
|
||||||
import tqdm
|
|
||||||
|
|
||||||
textcat = nlp.get_pipe("textcat")
|
textcat = nlp.get_pipe("textcat")
|
||||||
tok2vec_weights = textcat.model.tok2vec.to_bytes()
|
tok2vec_weights = textcat.model.tok2vec.to_bytes()
|
||||||
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
|
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
|
||||||
|
|
|
@ -8,6 +8,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from os import path
|
from os import path
|
||||||
|
|
||||||
|
import tqdm
|
||||||
import math
|
import math
|
||||||
import numpy
|
import numpy
|
||||||
import plac
|
import plac
|
||||||
|
@ -35,9 +36,6 @@ from tensorflow.contrib.tensorboard.plugins.projector import (
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
def main(vectors_loc, out_loc, name="spaCy_vectors"):
|
def main(vectors_loc, out_loc, name="spaCy_vectors"):
|
||||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
|
||||||
import tqdm
|
|
||||||
|
|
||||||
meta_file = "{}.tsv".format(name)
|
meta_file = "{}.tsv".format(name)
|
||||||
out_meta_file = path.join(out_loc, meta_file)
|
out_meta_file = path.join(out_loc, meta_file)
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,7 @@ numpy>=1.15.0
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
plac>=0.9.6,<1.2.0
|
plac>=0.9.6,<1.2.0
|
||||||
pathlib==1.0.1; python_version < "3.4"
|
pathlib==1.0.1; python_version < "3.4"
|
||||||
|
tqdm>=4.38.0,<5.0.0
|
||||||
# Optional dependencies
|
# Optional dependencies
|
||||||
jsonschema>=2.6.0,<3.1.0
|
jsonschema>=2.6.0,<3.1.0
|
||||||
# Development dependencies
|
# Development dependencies
|
||||||
|
|
|
@ -3,6 +3,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import math
|
import math
|
||||||
|
from tqdm import tqdm
|
||||||
import numpy
|
import numpy
|
||||||
from ast import literal_eval
|
from ast import literal_eval
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -116,9 +117,6 @@ def open_file(loc):
|
||||||
|
|
||||||
|
|
||||||
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
||||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
if freqs_loc is not None:
|
if freqs_loc is not None:
|
||||||
with msg.loading("Counting frequencies..."):
|
with msg.loading("Counting frequencies..."):
|
||||||
probs, _ = read_freqs(freqs_loc)
|
probs, _ = read_freqs(freqs_loc)
|
||||||
|
@ -201,9 +199,6 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
|
||||||
|
|
||||||
|
|
||||||
def read_vectors(vectors_loc):
|
def read_vectors(vectors_loc):
|
||||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
f = open_file(vectors_loc)
|
f = open_file(vectors_loc)
|
||||||
shape = tuple(int(size) for size in next(f).split())
|
shape = tuple(int(size) for size in next(f).split())
|
||||||
vectors_data = numpy.zeros(shape=shape, dtype="f")
|
vectors_data = numpy.zeros(shape=shape, dtype="f")
|
||||||
|
@ -220,9 +215,6 @@ def read_vectors(vectors_loc):
|
||||||
|
|
||||||
|
|
||||||
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
counts = PreshCounter()
|
counts = PreshCounter()
|
||||||
total = 0
|
total = 0
|
||||||
with freqs_loc.open() as f:
|
with freqs_loc.open() as f:
|
||||||
|
@ -252,9 +244,6 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
|
|
||||||
|
|
||||||
def read_clusters(clusters_loc):
|
def read_clusters(clusters_loc):
|
||||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
clusters = {}
|
clusters = {}
|
||||||
if ftfy is None:
|
if ftfy is None:
|
||||||
user_warning(Warnings.W004)
|
user_warning(Warnings.W004)
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals, division, print_function
|
from __future__ import unicode_literals, division, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
|
import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import srsly
|
import srsly
|
||||||
import cProfile
|
import cProfile
|
||||||
|
@ -46,9 +47,6 @@ def profile(model, inputs=None, n_texts=10000):
|
||||||
|
|
||||||
|
|
||||||
def parse_texts(nlp, texts):
|
def parse_texts(nlp, texts):
|
||||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
|
||||||
import tqdm
|
|
||||||
|
|
||||||
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@ from __future__ import unicode_literals, division, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import os
|
import os
|
||||||
|
import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from thinc.neural._classes.model import Model
|
from thinc.neural._classes.model import Model
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
|
@ -85,10 +86,6 @@ def train(
|
||||||
JSON format. To convert data from other formats, use the `spacy convert`
|
JSON format. To convert data from other formats, use the `spacy convert`
|
||||||
command.
|
command.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
|
||||||
import tqdm
|
|
||||||
|
|
||||||
util.fix_random_seed()
|
util.fix_random_seed()
|
||||||
util.set_env_log(verbose)
|
util.set_env_log(verbose)
|
||||||
|
|
||||||
|
@ -516,9 +513,6 @@ def _score_for_model(meta):
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
def _create_progress_bar(total):
|
def _create_progress_bar(total):
|
||||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
|
||||||
import tqdm
|
|
||||||
|
|
||||||
if int(os.environ.get("LOG_FRIENDLY", 0)):
|
if int(os.environ.get("LOG_FRIENDLY", 0)):
|
||||||
yield
|
yield
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -53,7 +53,9 @@ class Warnings(object):
|
||||||
W009 = ("Custom factory '{name}' provided by entry points of another "
|
W009 = ("Custom factory '{name}' provided by entry points of another "
|
||||||
"package overwrites built-in factory.")
|
"package overwrites built-in factory.")
|
||||||
W010 = ("As of v2.1.0, the PhraseMatcher doesn't have a phrase length "
|
W010 = ("As of v2.1.0, the PhraseMatcher doesn't have a phrase length "
|
||||||
"limit anymore, so the max_length argument is now deprecated.")
|
"limit anymore, so the max_length argument is now deprecated. "
|
||||||
|
"If you did not specify this parameter, make sure you call the "
|
||||||
|
"constructor with named arguments instead of positional ones.")
|
||||||
W011 = ("It looks like you're calling displacy.serve from within a "
|
W011 = ("It looks like you're calling displacy.serve from within a "
|
||||||
"Jupyter notebook or a similar environment. This likely means "
|
"Jupyter notebook or a similar environment. This likely means "
|
||||||
"you're already running a local web server, so there's no need to "
|
"you're already running a local web server, so there's no need to "
|
||||||
|
|
|
@ -38,6 +38,7 @@ be shown.
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| --------------------------------------- | --------------- | ------------------------------------------------------------------------------------------- |
|
| --------------------------------------- | --------------- | ------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
|
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
|
||||||
|
| `max_length` | int | Deprecated argument - the `PhraseMatcher` does not have a phrase length limit anymore. |
|
||||||
| `attr` <Tag variant="new">2.1</Tag> | int / unicode | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. |
|
| `attr` <Tag variant="new">2.1</Tag> | int / unicode | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. |
|
||||||
| `validate` <Tag variant="new">2.1</Tag> | bool | Validate patterns added to the matcher. |
|
| `validate` <Tag variant="new">2.1</Tag> | bool | Validate patterns added to the matcher. |
|
||||||
| **RETURNS** | `PhraseMatcher` | The newly constructed object. |
|
| **RETURNS** | `PhraseMatcher` | The newly constructed object. |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user