Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-03 22:06:37 +03:00

commit e5c319a051
Merge branch 'master' into spacy.io

 README.md | 10 +++++++---

@@ -104,6 +104,13 @@ For detailed installation instructions, see the
 [pip]: https://pypi.org/project/spacy/
 [conda]: https://anaconda.org/conda-forge/spacy
 
+> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary
+> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI
+> providers and other tooling to support it. This means that in order to run
+> spaCy on Python 3.8, you'll need [a compiler installed](#source) and compile
+> the library and its Cython dependencies locally. If this is causing problems
+> for you, the easiest solution is to **use Python 3.7** in the meantime.
+
 ### pip
 
 Using pip, spaCy releases are available as source packages and binary wheels (as
@@ -180,9 +187,6 @@ pointing pip to a path or URL.
 # download best-matching version of specific model for your spaCy installation
 python -m spacy download en_core_web_sm
 
-# out-of-the-box: download best-matching default model
-python -m spacy download en
-
 # pip install .tar.gz archive from path or URL
 pip install /Users/you/en_core_web_sm-2.2.0.tar.gz
 pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
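Once a model package is installed via pip as in the hunk above, it is loaded through the standard spacy.load entry point. A minimal sketch — the sample sentence is illustrative:

```python
import spacy

# spacy.load resolves the installed package "en_core_web_sm" and returns
# a Language object with the model's pipeline and weights.
nlp = spacy.load("en_core_web_sm")

doc = nlp("Apple is looking at buying U.K. startup for $1 billion.")
print([(ent.text, ent.label_) for ent in doc.ents])
```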
@@ -7,7 +7,6 @@ from __future__ import unicode_literals
 import plac
 from pathlib import Path
 import re
-import sys
 import json
 
 import spacy
@@ -19,12 +18,9 @@ from spacy.util import compounding, minibatch, minibatch_by_words
 from spacy.syntax.nonproj import projectivize
 from spacy.matcher import Matcher
 from spacy import displacy
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
+from collections import defaultdict
 
-import itertools
 import random
-import numpy.random
 
 from spacy import lang
 from spacy.lang import zh
@@ -323,10 +319,6 @@ def get_token_conllu(token, i):
     return "\n".join(lines)
 
 
-Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
-Token.set_extension("begins_fused", default=False, force=True)
-Token.set_extension("inside_fused", default=False, force=True)
-
 
 ##################
 # Initialization #
@@ -459,13 +451,13 @@ class TreebankPaths(object):
 
 @plac.annotations(
     ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
+    parses_dir=("Directory to write the development parses", "positional", None, Path),
     corpus=(
-        "UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
+        "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
         "positional",
         None,
         str,
     ),
-    parses_dir=("Directory to write the development parses", "positional", None, Path),
     config=("Path to json formatted config file", "option", "C", Path),
     limit=("Size limit", "option", "n", int),
     gpu_device=("Use GPU", "option", "g", int),
@@ -490,6 +482,10 @@ def main(
     # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
     import tqdm
 
+    Token.set_extension("get_conllu_lines", method=get_token_conllu)
+    Token.set_extension("begins_fused", default=False)
+    Token.set_extension("inside_fused", default=False)
+
    spacy.util.fix_random_seed()
     lang.zh.Chinese.Defaults.use_jieba = False
     lang.ja.Japanese.Defaults.use_janome = False
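The hunk above moves the Token.set_extension calls off module level and into main(), which is why the earlier module-level registrations (with their force=True flags) disappear. For reference, a minimal sketch of how these extension registrations behave; the names match the script, the blank pipeline and sample text are illustrative:

```python
import spacy
from spacy.tokens import Token

# force=True overwrites an existing registration; without it, registering
# the same extension name a second time raises a ValueError.
Token.set_extension("begins_fused", default=False, force=True)
Token.set_extension("inside_fused", default=False, force=True)

nlp = spacy.blank("en")
doc = nlp("a fused token example")
# Custom attributes are exposed under the underscore namespace.
print(doc[0]._.begins_fused)  # -> False until the UD reader sets it
```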
@@ -506,8 +502,8 @@ def main(
 
     docs, golds = read_data(
         nlp,
-        paths.train.conllu.open(),
-        paths.train.text.open(),
+        paths.train.conllu.open(encoding="utf8"),
+        paths.train.text.open(encoding="utf8"),
         max_doc_length=config.max_doc_length,
         limit=limit,
     )
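The encoding="utf8" arguments matter because pathlib.Path.open falls back to the platform's preferred locale encoding, which is not UTF-8 on many Windows setups, so UTF-8 CoNLL-U files can fail to read. A self-contained illustration using only the standard library; the file name is hypothetical:

```python
import locale
from pathlib import Path

# The fallback used when no encoding is passed -- e.g. cp1252 on Windows.
print(locale.getpreferredencoding(False))

path = Path("demo.conllu")  # hypothetical scratch file
path.write_text("# text = Über Äpfel\n", encoding="utf8")

# An explicit encoding makes the read portable across platforms.
with path.open(encoding="utf8") as f:
    print(f.read())
```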
examples/load_from_docbin.py | 45 (new file)

@@ -0,0 +1,45 @@
+# coding: utf-8
+"""
+Example of loading previously parsed text using spaCy's DocBin class. The example
+performs an entity count to show that the annotations are available.
+For more details, see https://spacy.io/usage/saving-loading#docs
+Installation:
+python -m spacy download en_core_web_lg
+Usage:
+python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy
+"""
+from __future__ import unicode_literals
+
+import spacy
+from spacy.tokens import DocBin
+from timeit import default_timer as timer
+from collections import Counter
+
+EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy"
+
+
+def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
+    nlp = spacy.load(model)
+    print("Reading data from {}".format(docbin_path))
+    with open(docbin_path, "rb") as file_:
+        bytes_data = file_.read()
+    nr_word = 0
+    start_time = timer()
+    entities = Counter()
+    docbin = DocBin().from_bytes(bytes_data)
+    for doc in docbin.get_docs(nlp.vocab):
+        nr_word += len(doc)
+        entities.update((e.label_, e.text) for e in doc.ents)
+    end_time = timer()
+    msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
+    wps = nr_word / (end_time - start_time)
+    print(msg.format(nr_word=nr_word, seconds=end_time - start_time, wps=wps))
+    print("Most common entities:")
+    for (label, entity), freq in entities.most_common(30):
+        print(freq, entity, label)
+
+
+if __name__ == "__main__":
+    import plac
+
+    plac.call(main)
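The new example consumes a pre-built .spacy file. As a counterpart, a minimal sketch of producing such a file with the same DocBin API; the attribute list, sample text, and output path are assumptions:

```python
# coding: utf-8
import spacy
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_lg")

# Store only the attributes the loader needs; smaller files, faster I/O.
doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
texts = ["Apple is looking at buying U.K. startup for $1 billion."]
for doc in nlp.pipe(texts):
    doc_bin.add(doc)

with open("parses.spacy", "wb") as file_:  # hypothetical output path
    file_.write(doc_bin.to_bytes())
```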
examples/training/conllu-config.json | 1 (new file)

@@ -0,0 +1 @@
+{"nr_epoch": 3, "batch_size": 24, "dropout": 0.001, "vectors": 0, "multitask_tag": 0, "multitask_sent": 0}
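The training-script changes below pass this file through a Config.load helper in the plac annotations. A minimal sketch of what a JSON-backed loader of that shape could look like; the class body is an assumption, only the key names come from the file above:

```python
import json


class Config(object):
    """Hypothetical stand-in for the training script's Config helper."""

    def __init__(self, **kwargs):
        # Expose JSON keys as attributes: config.nr_epoch, config.dropout, ...
        self.__dict__.update(kwargs)

    @classmethod
    def load(cls, path):
        with open(path, encoding="utf8") as f:
            return cls(**json.load(f))


config = Config.load("examples/training/conllu-config.json")
print(config.nr_epoch, config.batch_size, config.dropout)
```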
@@ -13,8 +13,7 @@ import spacy.util
 from spacy.tokens import Token, Doc
 from spacy.gold import GoldParse
 from spacy.syntax.nonproj import projectivize
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
+from collections import defaultdict
 from spacy.matcher import Matcher
 
 import itertools
@@ -290,11 +289,6 @@ def get_token_conllu(token, i):
     return "\n".join(lines)
 
 
-Token.set_extension("get_conllu_lines", method=get_token_conllu)
-Token.set_extension("begins_fused", default=False)
-Token.set_extension("inside_fused", default=False)
-
-
 ##################
 # Initialization #
 ##################
@@ -381,20 +375,24 @@ class TreebankPaths(object):
 
 @plac.annotations(
     ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
+    parses_dir=("Directory to write the development parses", "positional", None, Path),
+    config=("Path to json formatted config file", "positional", None, Config.load),
     corpus=(
-        "UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
+        "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
         "positional",
         None,
         str,
     ),
-    parses_dir=("Directory to write the development parses", "positional", None, Path),
-    config=("Path to json formatted config file", "positional", None, Config.load),
     limit=("Size limit", "option", "n", int),
 )
 def main(ud_dir, parses_dir, config, corpus, limit=0):
     # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
     import tqdm
 
+    Token.set_extension("get_conllu_lines", method=get_token_conllu)
+    Token.set_extension("begins_fused", default=False)
+    Token.set_extension("inside_fused", default=False)
+
     paths = TreebankPaths(ud_dir, corpus)
     if not (parses_dir / corpus).exists():
         (parses_dir / corpus).mkdir()
@@ -403,8 +401,8 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
 
     docs, golds = read_data(
         nlp,
-        paths.train.conllu.open(),
-        paths.train.text.open(),
+        paths.train.conllu.open(encoding="utf8"),
+        paths.train.text.open(encoding="utf8"),
         max_doc_length=config.max_doc_length,
         limit=limit,
     )
@@ -769,6 +769,7 @@ class Language(object):
                 texts,
                 batch_size=batch_size,
                 disable=disable,
+                n_process=n_process,
                 component_cfg=component_cfg,
             )
             for doc, context in izip(docs, contexts):
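This hunk forwards the new n_process argument through the as_tuples branch of Language.pipe, so multiprocessing also applies when each text carries a context object. A minimal usage sketch, assuming a spaCy build that includes this change; the texts and contexts are illustrative:

```python
import spacy

nlp = spacy.load("en_core_web_sm")

data = [
    ("A sentence about Berlin.", {"id": 1}),
    ("Another one about London.", {"id": 2}),
]

# as_tuples=True yields (doc, context) pairs; n_process fans the pipeline
# out over multiple OS processes (the default is 1).
for doc, context in nlp.pipe(data, as_tuples=True, n_process=2):
    print(context["id"], [ent.text for ent in doc.ents])
```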
@@ -20,6 +20,17 @@ available over [pip](https://pypi.python.org/pypi/spacy) and
 > possible, the new docs also include notes on features that have changed in
 > v2.0, and features that were introduced in the new version.
 
+<Infobox variant="warning" title="Important note for Python 3.8">
+
+We can't yet ship pre-compiled binary wheels for spaCy that work on Python 3.8,
+as we're still waiting for our CI providers and other tooling to support it.
+This means that in order to run spaCy on Python 3.8, you'll need
+[a compiler installed](#source) and compile the library and its Cython
+dependencies locally. If this is causing problems for you, the easiest solution
+is to **use Python 3.7** in the meantime.
+
+</Infobox>
+
 ## Quickstart {hidden="true"}
 
 import QuickstartInstall from 'widgets/quickstart-install.js'