Merge branch 'master' into spacy.io

This commit is contained in:
Ines Montani 2019-11-05 18:30:46 +01:00
commit e5c319a051
7 changed files with 84 additions and 28 deletions

View File

@ -104,6 +104,13 @@ For detailed installation instructions, see the
[pip]: https://pypi.org/project/spacy/
[conda]: https://anaconda.org/conda-forge/spacy
> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary
> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI
> providers and other tooling to support it. This means that in order to run
> spaCy on Python 3.8, you'll need [a compiler installed](#source) and compile
> the library and its Cython dependencies locally. If this is causing problems
> for you, the easiest solution is to **use Python 3.7** in the meantime.
### pip
Using pip, spaCy releases are available as source packages and binary wheels (as
@ -180,9 +187,6 @@ pointing pip to a path or URL.
# download best-matching version of specific model for your spaCy installation
python -m spacy download en_core_web_sm
# out-of-the-box: download best-matching default model
python -m spacy download en
# pip install .tar.gz archive from path or URL
pip install /Users/you/en_core_web_sm-2.2.0.tar.gz
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz

View File

@ -7,7 +7,6 @@ from __future__ import unicode_literals
import plac
from pathlib import Path
import re
import sys
import json
import spacy
@ -19,12 +18,9 @@ from spacy.util import compounding, minibatch, minibatch_by_words
from spacy.syntax.nonproj import projectivize
from spacy.matcher import Matcher
from spacy import displacy
from collections import defaultdict, Counter
from timeit import default_timer as timer
from collections import defaultdict
import itertools
import random
import numpy.random
from spacy import lang
from spacy.lang import zh
@ -323,10 +319,6 @@ def get_token_conllu(token, i):
return "\n".join(lines)
Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
Token.set_extension("begins_fused", default=False, force=True)
Token.set_extension("inside_fused", default=False, force=True)
##################
# Initialization #
@ -459,13 +451,13 @@ class TreebankPaths(object):
@plac.annotations(
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
parses_dir=("Directory to write the development parses", "positional", None, Path),
corpus=(
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
"UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
"positional",
None,
str,
),
parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "option", "C", Path),
limit=("Size limit", "option", "n", int),
gpu_device=("Use GPU", "option", "g", int),
@ -490,6 +482,10 @@ def main(
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
spacy.util.fix_random_seed()
lang.zh.Chinese.Defaults.use_jieba = False
lang.ja.Japanese.Defaults.use_janome = False
@ -506,8 +502,8 @@ def main(
docs, golds = read_data(
nlp,
paths.train.conllu.open(),
paths.train.text.open(),
paths.train.conllu.open(encoding="utf8"),
paths.train.text.open(encoding="utf8"),
max_doc_length=config.max_doc_length,
limit=limit,
)

View File

@ -0,0 +1,45 @@
# coding: utf-8
"""
Example of loading previously parsed text using spaCy's DocBin class. The example
performs an entity count to show that the annotations are available.
For more details, see https://spacy.io/usage/saving-loading#docs
Installation:
python -m spacy download en_core_web_lg
Usage:
python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy
"""
from __future__ import unicode_literals
import spacy
from spacy.tokens import DocBin
from timeit import default_timer as timer
from collections import Counter
EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy"
def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
    """Load a serialized DocBin, rebuild the Docs with *model*'s vocab, and
    report throughput plus the 30 most frequent (label, text) entity pairs."""
    nlp = spacy.load(model)
    print("Reading data from {}".format(docbin_path))
    with open(docbin_path, "rb") as input_file:
        serialized = input_file.read()
    token_count = 0
    started = timer()
    entity_counts = Counter()
    # Deserializing restores the saved annotations (including ents) without
    # re-running the pipeline; only the vocab is needed to rebuild the Docs.
    doc_bin = DocBin().from_bytes(serialized)
    for parsed_doc in doc_bin.get_docs(nlp.vocab):
        token_count += len(parsed_doc)
        entity_counts.update((ent.label_, ent.text) for ent in parsed_doc.ents)
    finished = timer()
    elapsed = finished - started
    msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
    print(msg.format(nr_word=token_count, seconds=elapsed, wps=token_count / elapsed))
    print("Most common entities:")
    for (entity_label, entity_text), frequency in entity_counts.most_common(30):
        print(frequency, entity_text, entity_label)
if __name__ == "__main__":
    # CLI entry point: plac turns main()'s keyword arguments into
    # command-line options (model name, path to the .spacy file).
    import plac
    plac.call(main)

View File

@ -0,0 +1 @@
{"nr_epoch": 3, "batch_size": 24, "dropout": 0.001, "vectors": 0, "multitask_tag": 0, "multitask_sent": 0}

View File

@ -13,8 +13,7 @@ import spacy.util
from spacy.tokens import Token, Doc
from spacy.gold import GoldParse
from spacy.syntax.nonproj import projectivize
from collections import defaultdict, Counter
from timeit import default_timer as timer
from collections import defaultdict
from spacy.matcher import Matcher
import itertools
@ -290,11 +289,6 @@ def get_token_conllu(token, i):
return "\n".join(lines)
Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
##################
# Initialization #
##################
@ -381,20 +375,24 @@ class TreebankPaths(object):
@plac.annotations(
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "positional", None, Config.load),
corpus=(
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
"UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
"positional",
None,
str,
),
parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "positional", None, Config.load),
limit=("Size limit", "option", "n", int),
)
def main(ud_dir, parses_dir, config, corpus, limit=0):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
paths = TreebankPaths(ud_dir, corpus)
if not (parses_dir / corpus).exists():
(parses_dir / corpus).mkdir()
@ -403,8 +401,8 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
docs, golds = read_data(
nlp,
paths.train.conllu.open(),
paths.train.text.open(),
paths.train.conllu.open(encoding="utf8"),
paths.train.text.open(encoding="utf8"),
max_doc_length=config.max_doc_length,
limit=limit,
)

View File

@ -769,6 +769,7 @@ class Language(object):
texts,
batch_size=batch_size,
disable=disable,
n_process=n_process,
component_cfg=component_cfg,
)
for doc, context in izip(docs, contexts):

View File

@ -20,6 +20,17 @@ available over [pip](https://pypi.python.org/pypi/spacy) and
> possible, the new docs also include notes on features that have changed in
> v2.0, and features that were introduced in the new version.
<Infobox variant="warning" title="Important note for Python 3.8">
We can't yet ship pre-compiled binary wheels for spaCy that work on Python 3.8,
as we're still waiting for our CI providers and other tooling to support it.
This means that in order to run spaCy on Python 3.8, you'll need
[a compiler installed](#source) and compile the library and its Cython
dependencies locally. If this is causing problems for you, the easiest solution
is to **use Python 3.7** in the meantime.
</Infobox>
## Quickstart {hidden="true"}
import QuickstartInstall from 'widgets/quickstart-install.js'