Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-01 04:46:38 +03:00

Merge branch 'master' into spacy.io

Commit: e5c319a051

README.md (10 lines changed)

@@ -104,6 +104,13 @@ For detailed installation instructions, see the
 [pip]: https://pypi.org/project/spacy/
 [conda]: https://anaconda.org/conda-forge/spacy
 
+> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary
+> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI
+> providers and other tooling to support it. This means that in order to run
+> spaCy on Python 3.8, you'll need [a compiler installed](#source) and compile
+> the library and its Cython dependencies locally. If this is causing problems
+> for you, the easiest solution is to **use Python 3.7** in the meantime.
+
 ### pip
 
 Using pip, spaCy releases are available as source packages and binary wheels (as
@@ -180,9 +187,6 @@ pointing pip to a path or URL.
 # download best-matching version of specific model for your spaCy installation
 python -m spacy download en_core_web_sm
 
-# out-of-the-box: download best-matching default model
-python -m spacy download en
-
 # pip install .tar.gz archive from path or URL
 pip install /Users/you/en_core_web_sm-2.2.0.tar.gz
 pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
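
For context: once a model package has been installed by any of the routes above, it loads by name. A minimal sketch, assuming `en_core_web_sm` 2.2.0 from the commands above is installed:

```python
import spacy

# spacy.load() resolves the installed package name to its model data.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
print([(ent.text, ent.label_) for ent in doc.ents])
```
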
bin/ud/ud_train.py

@@ -7,7 +7,6 @@ from __future__ import unicode_literals
 import plac
 from pathlib import Path
 import re
-import sys
 import json
 
 import spacy
@@ -19,12 +18,9 @@ from spacy.util import compounding, minibatch, minibatch_by_words
 from spacy.syntax.nonproj import projectivize
 from spacy.matcher import Matcher
 from spacy import displacy
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
+from collections import defaultdict
 
 import itertools
 import random
-import numpy.random
-
 from spacy import lang
 from spacy.lang import zh
@@ -323,10 +319,6 @@ def get_token_conllu(token, i):
     return "\n".join(lines)
 
 
-Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
-Token.set_extension("begins_fused", default=False, force=True)
-Token.set_extension("inside_fused", default=False, force=True)
-
 
 ##################
 # Initialization #
@@ -459,13 +451,13 @@ class TreebankPaths(object):
 
 @plac.annotations(
     ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
-    parses_dir=("Directory to write the development parses", "positional", None, Path),
     corpus=(
-        "UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
+        "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
         "positional",
         None,
         str,
     ),
+    parses_dir=("Directory to write the development parses", "positional", None, Path),
     config=("Path to json formatted config file", "option", "C", Path),
     limit=("Size limit", "option", "n", int),
     gpu_device=("Use GPU", "option", "g", int),
@@ -490,6 +482,10 @@ def main(
     # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
     import tqdm
 
+    Token.set_extension("get_conllu_lines", method=get_token_conllu)
+    Token.set_extension("begins_fused", default=False)
+    Token.set_extension("inside_fused", default=False)
+
     spacy.util.fix_random_seed()
     lang.zh.Chinese.Defaults.use_jieba = False
     lang.ja.Japanese.Defaults.use_janome = False
@@ -506,8 +502,8 @@ def main(
 
     docs, golds = read_data(
         nlp,
-        paths.train.conllu.open(),
-        paths.train.text.open(),
+        paths.train.conllu.open(encoding="utf8"),
+        paths.train.text.open(encoding="utf8"),
         max_doc_length=config.max_doc_length,
         limit=limit,
     )
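
Two recurring fixes in this file: `Path.open()` without an explicit encoding falls back to the platform locale, so the CoNLL-U readers now pass `encoding="utf8"`; and the `Token.set_extension` calls move from import time into `main()`. The standalone sketch below (not part of the script) shows why module-level registration needed `force=True` in the first place:

```python
from spacy.tokens import Token

# First registration of a custom attribute succeeds.
Token.set_extension("begins_fused", default=False)

# Registering the same name again raises a ValueError...
try:
    Token.set_extension("begins_fused", default=False)
except ValueError as err:
    print("duplicate registration:", err)

# ...unless force=True is passed to overwrite the existing attribute.
Token.set_extension("begins_fused", default=False, force=True)
```
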
examples/load_from_docbin.py (new file, 45 lines)

@@ -0,0 +1,45 @@
+# coding: utf-8
+"""
+Example of loading previously parsed text using spaCy's DocBin class. The example
+performs an entity count to show that the annotations are available.
+For more details, see https://spacy.io/usage/saving-loading#docs
+Installation:
+python -m spacy download en_core_web_lg
+Usage:
+python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy
+"""
+from __future__ import unicode_literals
+
+import spacy
+from spacy.tokens import DocBin
+from timeit import default_timer as timer
+from collections import Counter
+
+EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy"
+
+
+def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
+    nlp = spacy.load(model)
+    print("Reading data from {}".format(docbin_path))
+    with open(docbin_path, "rb") as file_:
+        bytes_data = file_.read()
+    nr_word = 0
+    start_time = timer()
+    entities = Counter()
+    docbin = DocBin().from_bytes(bytes_data)
+    for doc in docbin.get_docs(nlp.vocab):
+        nr_word += len(doc)
+        entities.update((e.label_, e.text) for e in doc.ents)
+    end_time = timer()
+    msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
+    wps = nr_word / (end_time - start_time)
+    print(msg.format(nr_word=nr_word, seconds=end_time - start_time, wps=wps))
+    print("Most common entities:")
+    for (label, entity), freq in entities.most_common(30):
+        print(freq, entity, label)
+
+
+if __name__ == "__main__":
+    import plac
+
+    plac.call(main)
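
The example assumes a `.spacy` archive already exists on disk. A counterpart sketch that would produce one (the model choice and output path are placeholders, not part of this commit):

```python
# coding: utf-8
from __future__ import unicode_literals

import spacy
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_sm")
texts = ["Berlin is a city in Germany.", "The DocBin stores one Doc per text."]

# Choose which token attributes to serialize alongside the texts.
docbin = DocBin(attrs=["TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE"])
for doc in nlp.pipe(texts):
    docbin.add(doc)

with open("example_parses.spacy", "wb") as file_:
    file_.write(docbin.to_bytes())
```
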
examples/training/conllu-config.json (new file, 1 line)

@@ -0,0 +1 @@
+{"nr_epoch": 3, "batch_size": 24, "dropout": 0.001, "vectors": 0, "multitask_tag": 0, "multitask_sent": 0}
examples/training/conllu.py

@@ -13,8 +13,7 @@ import spacy.util
 from spacy.tokens import Token, Doc
 from spacy.gold import GoldParse
 from spacy.syntax.nonproj import projectivize
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
+from collections import defaultdict
 from spacy.matcher import Matcher
 
 import itertools
@@ -290,11 +289,6 @@ def get_token_conllu(token, i):
     return "\n".join(lines)
 
 
-Token.set_extension("get_conllu_lines", method=get_token_conllu)
-Token.set_extension("begins_fused", default=False)
-Token.set_extension("inside_fused", default=False)
-
-
 ##################
 # Initialization #
 ##################
@@ -381,20 +375,24 @@ class TreebankPaths(object):
 
 @plac.annotations(
     ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
-    parses_dir=("Directory to write the development parses", "positional", None, Path),
-    config=("Path to json formatted config file", "positional", None, Config.load),
     corpus=(
-        "UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
+        "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
         "positional",
         None,
         str,
     ),
+    parses_dir=("Directory to write the development parses", "positional", None, Path),
+    config=("Path to json formatted config file", "positional", None, Config.load),
     limit=("Size limit", "option", "n", int),
 )
 def main(ud_dir, parses_dir, config, corpus, limit=0):
     # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
     import tqdm
 
+    Token.set_extension("get_conllu_lines", method=get_token_conllu)
+    Token.set_extension("begins_fused", default=False)
+    Token.set_extension("inside_fused", default=False)
+
     paths = TreebankPaths(ud_dir, corpus)
     if not (parses_dir / corpus).exists():
         (parses_dir / corpus).mkdir()
@@ -403,8 +401,8 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
 
     docs, golds = read_data(
         nlp,
-        paths.train.conllu.open(),
-        paths.train.text.open(),
+        paths.train.conllu.open(encoding="utf8"),
+        paths.train.text.open(encoding="utf8"),
         max_doc_length=config.max_doc_length,
         limit=limit,
    )
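
Both training scripts rely on `plac` to expose `main()` on the command line: each annotation tuple is `(help, kind, abbrev, type)`, so `"positional"` entries become bare CLI arguments and `"option"` entries become flags like `-n`. A stripped-down sketch of the same pattern (toy arguments, not the script's real interface):

```python
import plac
from pathlib import Path


@plac.annotations(
    ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
    limit=("Size limit", "option", "n", int),
)
def main(ud_dir, limit=0):
    # e.g. `python demo.py /data/ud-treebanks -n 100`
    print("corpus:", ud_dir, "limit:", limit)


if __name__ == "__main__":
    plac.call(main)
```
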
spacy/language.py

@@ -769,6 +769,7 @@ class Language(object):
             texts,
             batch_size=batch_size,
             disable=disable,
+            n_process=n_process,
             component_cfg=component_cfg,
         )
         for doc, context in izip(docs, contexts):
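
The one-line change forwards `n_process` into the inner `pipe()` call on the `as_tuples` path, so multiprocessing also applies when streaming `(text, context)` pairs. A small usage sketch (model and texts are placeholders):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
data = [("Google was founded in 1998.", {"id": 1}),
        ("Berlin is in Germany.", {"id": 2})]

# as_tuples=True yields (doc, context) pairs; n_process now reaches this path too.
for doc, context in nlp.pipe(data, as_tuples=True, n_process=2):
    print(context["id"], [ent.text for ent in doc.ents])
```
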
website/docs/usage/index.md

@@ -20,6 +20,17 @@ available over [pip](https://pypi.python.org/pypi/spacy) and
 > possible, the new docs also include notes on features that have changed in
 > v2.0, and features that were introduced in the new version.
 
+<Infobox variant="warning" title="Important note for Python 3.8">
+
+We can't yet ship pre-compiled binary wheels for spaCy that work on Python 3.8,
+as we're still waiting for our CI providers and other tooling to support it.
+This means that in order to run spaCy on Python 3.8, you'll need
+[a compiler installed](#source) and compile the library and its Cython
+dependencies locally. If this is causing problems for you, the easiest solution
+is to **use Python 3.7** in the meantime.
+
+</Infobox>
+
 ## Quickstart {hidden="true"}
 
 import QuickstartInstall from 'widgets/quickstart-install.js'