mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Merge branch 'develop' into support-danish
This commit is contained in:
commit
facf77e541
|
@ -1 +1,55 @@
|
|||
environment:
|
||||
|
||||
matrix:
|
||||
|
||||
# For Python versions available on Appveyor, see
|
||||
# http://www.appveyor.com/docs/installed-software#python
|
||||
# The list here is complete (excluding Python 2.6, which
|
||||
# isn't covered by this document) at the time of writing.
|
||||
|
||||
- PYTHON: "C:\\Python27"
|
||||
#- PYTHON: "C:\\Python33"
|
||||
#- PYTHON: "C:\\Python34"
|
||||
#- PYTHON: "C:\\Python35"
|
||||
#- PYTHON: "C:\\Python27-x64"
|
||||
#- PYTHON: "C:\\Python33-x64"
|
||||
#- DISTUTILS_USE_SDK: "1"
|
||||
#- PYTHON: "C:\\Python34-x64"
|
||||
#- DISTUTILS_USE_SDK: "1"
|
||||
#- PYTHON: "C:\\Python35-x64"
|
||||
- PYTHON: "C:\\Python36-x64"
|
||||
|
||||
install:
|
||||
# We need wheel installed to build wheels
|
||||
- "%PYTHON%\\python.exe -m pip install wheel"
|
||||
- "%PYTHON%\\python.exe -m pip install cython"
|
||||
- "%PYTHON%\\python.exe -m pip install -r requirements.txt"
|
||||
- "%PYTHON%\\python.exe -m pip install -e ."
|
||||
|
||||
build: off
|
||||
|
||||
test_script:
|
||||
# Put your test command here.
|
||||
# If you don't need to build C extensions on 64-bit Python 3.3 or 3.4,
|
||||
# you can remove "build.cmd" from the front of the command, as it's
|
||||
# only needed to support those cases.
|
||||
# Note that you must use the environment variable %PYTHON% to refer to
|
||||
# the interpreter you're using - Appveyor does not do anything special
|
||||
# to put the Python version you want to use on PATH.
|
||||
- "%PYTHON%\\python.exe -m pytest spacy/"
|
||||
|
||||
after_test:
|
||||
# This step builds your wheels.
|
||||
# Again, you only need build.cmd if you're building C extensions for
|
||||
# 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct
|
||||
# interpreter
|
||||
- "%PYTHON%\\python.exe setup.py bdist_wheel"
|
||||
|
||||
artifacts:
|
||||
# bdist_wheel puts your built wheel in the dist directory
|
||||
- path: dist\*
|
||||
|
||||
#on_success:
|
||||
# You can use this step to upload your artifacts to a public website.
|
||||
# See Appveyor's documentation for more details. Or you can simply
|
||||
# access your wheels from the Appveyor "artifacts" tab for your build.
|
||||
|
|
11
.buildkite/sdist.yml
Normal file
11
.buildkite/sdist.yml
Normal file
|
@ -0,0 +1,11 @@
|
|||
steps:
|
||||
-
|
||||
command: "fab env clean make test sdist"
|
||||
label: ":dizzy: :python:"
|
||||
artifact_paths: "dist/*.tar.gz"
|
||||
- wait
|
||||
- trigger: "spacy-sdist-against-models"
|
||||
label: ":dizzy: :hammer:"
|
||||
build:
|
||||
env:
|
||||
SPACY_VERSION: "{$SPACY_VERSION}"
|
106
.github/contributors/ramananbalakrishnan.md
vendored
Normal file
106
.github/contributors/ramananbalakrishnan.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
|||
# spaCy contributor agreement
|
||||
|
||||
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||
The SCA applies to any contribution that you make to any product or project
|
||||
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||
**"you"** shall mean the person or entity identified below.
|
||||
|
||||
If you agree to be bound by these terms, fill in the information requested
|
||||
below and include the filled-in version with your first pull request, under the
|
||||
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||
should be your GitHub username, with the extension `.md`. For example, the user
|
||||
example_user would create the file `.github/contributors/example_user.md`.
|
||||
|
||||
Read this agreement carefully before signing. These terms and conditions
|
||||
constitute a binding legal agreement.
|
||||
|
||||
## Contributor Agreement
|
||||
|
||||
1. The term "contribution" or "contributed materials" means any source code,
|
||||
object code, patch, tool, sample, graphic, specification, manual,
|
||||
documentation, or any other material posted or submitted by you to the project.
|
||||
|
||||
2. With respect to any worldwide copyrights, or copyright applications and
|
||||
registrations, in your contribution:
|
||||
|
||||
* you hereby assign to us joint ownership, and to the extent that such
|
||||
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||
royalty-free, unrestricted license to exercise all rights under those
|
||||
copyrights. This includes, at our option, the right to sublicense these same
|
||||
rights to third parties through multiple levels of sublicensees or other
|
||||
licensing arrangements;
|
||||
|
||||
* you agree that each of us can do all things in relation to your
|
||||
contribution as if each of us were the sole owners, and if one of us makes
|
||||
a derivative work of your contribution, the one who makes the derivative
|
||||
work (or has it made will be the sole owner of that derivative work;
|
||||
|
||||
* you agree that you will not assert any moral rights in your contribution
|
||||
against us, our licensees or transferees;
|
||||
|
||||
* you agree that we may register a copyright in your contribution and
|
||||
exercise all ownership rights associated with it; and
|
||||
|
||||
* you agree that neither of us has any duty to consult with, obtain the
|
||||
consent of, pay or render an accounting to the other for any use or
|
||||
distribution of your contribution.
|
||||
|
||||
3. With respect to any patents you own, or that you can license without payment
|
||||
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||
|
||||
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||
your contribution in whole or in part, alone or in combination with or
|
||||
included in any product, work or materials arising out of the project to
|
||||
which your contribution was submitted, and
|
||||
|
||||
* at our option, to sublicense these same rights to third parties through
|
||||
multiple levels of sublicensees or other licensing arrangements.
|
||||
|
||||
4. Except as set out above, you keep all right, title, and interest in your
|
||||
contribution. The rights that you grant to us under these terms are effective
|
||||
on the date you first submitted a contribution to us, even if your submission
|
||||
took place before the date you sign these terms.
|
||||
|
||||
5. You covenant, represent, warrant and agree that:
|
||||
|
||||
* Each contribution that you submit is and shall be an original work of
|
||||
authorship and you can legally grant the rights set out in this SCA;
|
||||
|
||||
* to the best of your knowledge, each contribution will not violate any
|
||||
third party's copyrights, trademarks, patents, or other intellectual
|
||||
property rights; and
|
||||
|
||||
* each contribution shall be in compliance with U.S. export control laws and
|
||||
other applicable export and import laws. You agree to notify us if you
|
||||
become aware of any circumstance which would make any of the foregoing
|
||||
representations inaccurate in any respect. We may publicly disclose your
|
||||
participation in the project, including the fact that you have signed the SCA.
|
||||
|
||||
6. This SCA is governed by the laws of the State of California and applicable
|
||||
U.S. Federal law. Any choice of law rules will not apply.
|
||||
|
||||
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||
mark both statements:
|
||||
|
||||
* [x] I am signing on behalf of myself as an individual and no other person
|
||||
or entity, including my employer, has or will have rights with respect to my
|
||||
contributions.
|
||||
|
||||
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||
actual authority to contractually bind that entity.
|
||||
|
||||
## Contributor Details
|
||||
|
||||
| Field | Entry |
|
||||
|------------------------------- | -------------------- |
|
||||
| Name | Ramanan Balakrishnan |
|
||||
| Company name (if applicable) | |
|
||||
| Title or role (if applicable) | |
|
||||
| Date | 2017-10-19 |
|
||||
| GitHub username | ramananbalakrishnan |
|
||||
| Website (optional) | |
|
5
.gitignore
vendored
5
.gitignore
vendored
|
@ -1,14 +1,12 @@
|
|||
# spaCy
|
||||
spacy/data/
|
||||
corpora/
|
||||
models/
|
||||
/models/
|
||||
keys/
|
||||
|
||||
# Website
|
||||
website/www/
|
||||
website/_deploy.sh
|
||||
website/package.json
|
||||
website/announcement.jade
|
||||
website/.gitignore
|
||||
|
||||
# Cython / C extensions
|
||||
|
@ -40,7 +38,6 @@ venv/
|
|||
|
||||
# Distribution / packaging
|
||||
env/
|
||||
bin/
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
|
|
|
@ -14,8 +14,7 @@ os:
|
|||
env:
|
||||
- VIA=compile LC_ALL=en_US.ascii
|
||||
- VIA=compile
|
||||
|
||||
# - VIA=sdist
|
||||
#- VIA=pypi_nightly
|
||||
|
||||
install:
|
||||
- "./travis.sh"
|
||||
|
@ -23,7 +22,7 @@ install:
|
|||
script:
|
||||
- "pip install pytest pytest-timeout"
|
||||
- if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
|
||||
- if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi
|
||||
- if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
|
||||
- if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
|
||||
|
||||
notifications:
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
recursive-include include *.h
|
||||
include LICENSE
|
||||
include README.rst
|
||||
include bin/spacy
|
||||
|
|
|
@ -229,7 +229,7 @@ Compile from source
|
|||
The other way to install spaCy is to clone its
|
||||
`GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
|
||||
source. That is the common way if you want to make changes to the code base.
|
||||
You'll need to make sure that you have a development enviroment consisting of a
|
||||
You'll need to make sure that you have a development environment consisting of a
|
||||
Python distribution including header files, a compiler,
|
||||
`pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
|
||||
and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.
|
||||
|
|
|
@ -1,322 +0,0 @@
|
|||
'''WIP --- Doesn't work well yet'''
|
||||
import plac
|
||||
import random
|
||||
import six
|
||||
|
||||
import cProfile
|
||||
import pstats
|
||||
|
||||
import pathlib
|
||||
import cPickle as pickle
|
||||
from itertools import izip
|
||||
|
||||
import spacy
|
||||
|
||||
import cytoolz
|
||||
import cupy as xp
|
||||
import cupy.cuda
|
||||
import chainer.cuda
|
||||
|
||||
import chainer.links as L
|
||||
import chainer.functions as F
|
||||
from chainer import Chain, Variable, report
|
||||
import chainer.training
|
||||
import chainer.optimizers
|
||||
from chainer.training import extensions
|
||||
from chainer.iterators import SerialIterator
|
||||
from chainer.datasets import TupleDataset
|
||||
|
||||
|
||||
class SentimentAnalyser(object):
|
||||
@classmethod
|
||||
def load(cls, path, nlp, max_length=100):
|
||||
raise NotImplementedError
|
||||
#with (path / 'config.json').open() as file_:
|
||||
# model = model_from_json(file_.read())
|
||||
#with (path / 'model').open('rb') as file_:
|
||||
# lstm_weights = pickle.load(file_)
|
||||
#embeddings = get_embeddings(nlp.vocab)
|
||||
#model.set_weights([embeddings] + lstm_weights)
|
||||
#return cls(model, max_length=max_length)
|
||||
|
||||
def __init__(self, model, max_length=100):
|
||||
self._model = model
|
||||
self.max_length = max_length
|
||||
|
||||
def __call__(self, doc):
|
||||
X = get_features([doc], self.max_length)
|
||||
y = self._model.predict(X)
|
||||
self.set_sentiment(doc, y)
|
||||
|
||||
def pipe(self, docs, batch_size=1000, n_threads=2):
|
||||
for minibatch in cytoolz.partition_all(batch_size, docs):
|
||||
minibatch = list(minibatch)
|
||||
sentences = []
|
||||
for doc in minibatch:
|
||||
sentences.extend(doc.sents)
|
||||
Xs = get_features(sentences, self.max_length)
|
||||
ys = self._model.predict(Xs)
|
||||
for sent, label in zip(sentences, ys):
|
||||
sent.doc.sentiment += label - 0.5
|
||||
for doc in minibatch:
|
||||
yield doc
|
||||
|
||||
def set_sentiment(self, doc, y):
|
||||
doc.sentiment = float(y[0])
|
||||
# Sentiment has a native slot for a single float.
|
||||
# For arbitrary data storage, there's:
|
||||
# doc.user_data['my_data'] = y
|
||||
|
||||
|
||||
class Classifier(Chain):
|
||||
def __init__(self, predictor):
|
||||
super(Classifier, self).__init__(predictor=predictor)
|
||||
|
||||
def __call__(self, x, t):
|
||||
y = self.predictor(x)
|
||||
loss = F.softmax_cross_entropy(y, t)
|
||||
accuracy = F.accuracy(y, t)
|
||||
report({'loss': loss, 'accuracy': accuracy}, self)
|
||||
return loss
|
||||
|
||||
|
||||
class SentimentModel(Chain):
|
||||
def __init__(self, nlp, shape, **settings):
|
||||
Chain.__init__(self,
|
||||
embed=_Embed(shape['nr_vector'], shape['nr_dim'], shape['nr_hidden'],
|
||||
set_vectors=lambda arr: set_vectors(arr, nlp.vocab)),
|
||||
encode=_Encode(shape['nr_hidden'], shape['nr_hidden']),
|
||||
attend=_Attend(shape['nr_hidden'], shape['nr_hidden']),
|
||||
predict=_Predict(shape['nr_hidden'], shape['nr_class']))
|
||||
self.to_gpu(0)
|
||||
|
||||
def __call__(self, sentence):
|
||||
return self.predict(
|
||||
self.attend(
|
||||
self.encode(
|
||||
self.embed(sentence))))
|
||||
|
||||
|
||||
class _Embed(Chain):
|
||||
def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None):
|
||||
Chain.__init__(self,
|
||||
embed=L.EmbedID(nr_vector, nr_dim, initialW=set_vectors),
|
||||
project=L.Linear(None, nr_out, nobias=True))
|
||||
self.embed.W.volatile = False
|
||||
|
||||
def __call__(self, sentence):
|
||||
return [self.project(self.embed(ts)) for ts in F.transpose(sentence)]
|
||||
|
||||
|
||||
class _Encode(Chain):
|
||||
def __init__(self, nr_in, nr_out):
|
||||
Chain.__init__(self,
|
||||
fwd=L.LSTM(nr_in, nr_out),
|
||||
bwd=L.LSTM(nr_in, nr_out),
|
||||
mix=L.Bilinear(nr_out, nr_out, nr_out))
|
||||
|
||||
def __call__(self, sentence):
|
||||
self.fwd.reset_state()
|
||||
fwds = map(self.fwd, sentence)
|
||||
self.bwd.reset_state()
|
||||
bwds = reversed(map(self.bwd, reversed(sentence)))
|
||||
return [F.elu(self.mix(f, b)) for f, b in zip(fwds, bwds)]
|
||||
|
||||
|
||||
class _Attend(Chain):
|
||||
def __init__(self, nr_in, nr_out):
|
||||
Chain.__init__(self)
|
||||
|
||||
def __call__(self, sentence):
|
||||
sent = sum(sentence)
|
||||
return sent
|
||||
|
||||
|
||||
class _Predict(Chain):
|
||||
def __init__(self, nr_in, nr_out):
|
||||
Chain.__init__(self,
|
||||
l1=L.Linear(nr_in, nr_in),
|
||||
l2=L.Linear(nr_in, nr_out))
|
||||
|
||||
def __call__(self, vector):
|
||||
vector = self.l1(vector)
|
||||
vector = F.elu(vector)
|
||||
vector = self.l2(vector)
|
||||
return vector
|
||||
|
||||
|
||||
class SentenceDataset(TupleDataset):
|
||||
def __init__(self, nlp, texts, labels, max_length):
|
||||
self.max_length = max_length
|
||||
sents, labels = self._get_labelled_sentences(
|
||||
nlp.pipe(texts, batch_size=5000, n_threads=3),
|
||||
labels)
|
||||
TupleDataset.__init__(self,
|
||||
get_features(sents, max_length),
|
||||
labels)
|
||||
|
||||
def __getitem__(self, index):
|
||||
batches = [dataset[index] for dataset in self._datasets]
|
||||
if isinstance(index, slice):
|
||||
length = len(batches[0])
|
||||
returns = [tuple([batch[i] for batch in batches])
|
||||
for i in six.moves.range(length)]
|
||||
return returns
|
||||
else:
|
||||
return tuple(batches)
|
||||
|
||||
def _get_labelled_sentences(self, docs, doc_labels):
|
||||
labels = []
|
||||
sentences = []
|
||||
for doc, y in izip(docs, doc_labels):
|
||||
for sent in doc.sents:
|
||||
sentences.append(sent)
|
||||
labels.append(y)
|
||||
return sentences, xp.asarray(labels, dtype='i')
|
||||
|
||||
|
||||
class DocDataset(TupleDataset):
|
||||
def __init__(self, nlp, texts, labels):
|
||||
self.max_length = max_length
|
||||
DatasetMixin.__init__(self,
|
||||
get_features(
|
||||
nlp.pipe(texts, batch_size=5000, n_threads=3), self.max_length),
|
||||
labels)
|
||||
|
||||
def read_data(data_dir, limit=0):
|
||||
examples = []
|
||||
for subdir, label in (('pos', 1), ('neg', 0)):
|
||||
for filename in (data_dir / subdir).iterdir():
|
||||
with filename.open() as file_:
|
||||
text = file_.read()
|
||||
examples.append((text, label))
|
||||
random.shuffle(examples)
|
||||
if limit >= 1:
|
||||
examples = examples[:limit]
|
||||
return zip(*examples) # Unzips into two lists
|
||||
|
||||
|
||||
def get_features(docs, max_length):
|
||||
docs = list(docs)
|
||||
Xs = xp.zeros((len(docs), max_length), dtype='i')
|
||||
for i, doc in enumerate(docs):
|
||||
j = 0
|
||||
for token in doc:
|
||||
if token.has_vector and not token.is_punct and not token.is_space:
|
||||
Xs[i, j] = token.norm
|
||||
j += 1
|
||||
if j >= max_length:
|
||||
break
|
||||
return Xs
|
||||
|
||||
|
||||
def set_vectors(vectors, vocab):
|
||||
for lex in vocab:
|
||||
if lex.has_vector and (lex.rank+1) < vectors.shape[0]:
|
||||
lex.norm = lex.rank+1
|
||||
vectors[lex.rank + 1] = lex.vector
|
||||
else:
|
||||
lex.norm = 0
|
||||
return vectors
|
||||
|
||||
|
||||
def train(train_texts, train_labels, dev_texts, dev_labels,
|
||||
lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
|
||||
by_sentence=True):
|
||||
nlp = spacy.load('en', entity=False)
|
||||
if 'nr_vector' not in lstm_shape:
|
||||
lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector)
|
||||
if 'nr_dim' not in lstm_shape:
|
||||
lstm_shape['nr_dim'] = nlp.vocab.vectors_length
|
||||
print("Make model")
|
||||
model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings))
|
||||
print("Parsing texts...")
|
||||
if by_sentence:
|
||||
train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length'])
|
||||
dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length'])
|
||||
else:
|
||||
train_data = DocDataset(nlp, train_texts, train_labels)
|
||||
dev_data = DocDataset(nlp, dev_texts, dev_labels)
|
||||
train_iter = SerialIterator(train_data, batch_size=batch_size,
|
||||
shuffle=True, repeat=True)
|
||||
dev_iter = SerialIterator(dev_data, batch_size=batch_size,
|
||||
shuffle=False, repeat=False)
|
||||
optimizer = chainer.optimizers.Adam()
|
||||
optimizer.setup(model)
|
||||
updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0)
|
||||
trainer = chainer.training.Trainer(updater, (1, 'epoch'), out='result')
|
||||
|
||||
trainer.extend(extensions.Evaluator(dev_iter, model, device=0))
|
||||
trainer.extend(extensions.LogReport())
|
||||
trainer.extend(extensions.PrintReport([
|
||||
'epoch', 'main/accuracy', 'validation/main/accuracy']))
|
||||
trainer.extend(extensions.ProgressBar())
|
||||
|
||||
trainer.run()
|
||||
|
||||
|
||||
def evaluate(model_dir, texts, labels, max_length=100):
|
||||
def create_pipeline(nlp):
|
||||
'''
|
||||
This could be a lambda, but named functions are easier to read in Python.
|
||||
'''
|
||||
return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp,
|
||||
max_length=max_length)]
|
||||
|
||||
nlp = spacy.load('en')
|
||||
nlp.pipeline = create_pipeline(nlp)
|
||||
|
||||
correct = 0
|
||||
i = 0
|
||||
for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
|
||||
correct += bool(doc.sentiment >= 0.5) == bool(labels[i])
|
||||
i += 1
|
||||
return float(correct) / i
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
train_dir=("Location of training file or directory"),
|
||||
dev_dir=("Location of development file or directory"),
|
||||
model_dir=("Location of output model directory",),
|
||||
is_runtime=("Demonstrate run-time usage", "flag", "r", bool),
|
||||
nr_hidden=("Number of hidden units", "option", "H", int),
|
||||
max_length=("Maximum sentence length", "option", "L", int),
|
||||
dropout=("Dropout", "option", "d", float),
|
||||
learn_rate=("Learn rate", "option", "e", float),
|
||||
nb_epoch=("Number of training epochs", "option", "i", int),
|
||||
batch_size=("Size of minibatches for training LSTM", "option", "b", int),
|
||||
nr_examples=("Limit to N examples", "option", "n", int)
|
||||
)
|
||||
def main(model_dir, train_dir, dev_dir,
|
||||
is_runtime=False,
|
||||
nr_hidden=64, max_length=100, # Shape
|
||||
dropout=0.5, learn_rate=0.001, # General NN config
|
||||
nb_epoch=5, batch_size=32, nr_examples=-1): # Training params
|
||||
model_dir = pathlib.Path(model_dir)
|
||||
train_dir = pathlib.Path(train_dir)
|
||||
dev_dir = pathlib.Path(dev_dir)
|
||||
if is_runtime:
|
||||
dev_texts, dev_labels = read_data(dev_dir)
|
||||
acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
|
||||
print(acc)
|
||||
else:
|
||||
print("Read data")
|
||||
train_texts, train_labels = read_data(train_dir, limit=nr_examples)
|
||||
dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
|
||||
print("Using GPU 0")
|
||||
#chainer.cuda.get_device(0).use()
|
||||
train_labels = xp.asarray(train_labels, dtype='i')
|
||||
dev_labels = xp.asarray(dev_labels, dtype='i')
|
||||
lstm = train(train_texts, train_labels, dev_texts, dev_labels,
|
||||
{'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2,
|
||||
'nr_vector': 5000},
|
||||
{'dropout': 0.5, 'lr': learn_rate},
|
||||
{},
|
||||
nb_epoch=nb_epoch, batch_size=batch_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
#cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
|
||||
#s = pstats.Stats("Profile.prof")
|
||||
#s.strip_dirs().sort_stats("time").print_stats()
|
||||
plac.call(main)
|
|
@ -1,281 +0,0 @@
|
|||
"""This script expects something like a binary sentiment data set, such as
|
||||
that available here: `http://www.cs.cornell.edu/people/pabo/movie-review-data/`
|
||||
|
||||
It expects a directory structure like: `data_dir/train/{pos|neg}`
|
||||
and `data_dir/test/{pos|neg}`. Put (say) 90% of the files in the former
|
||||
and the remainder in the latter.
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import print_function
|
||||
from __future__ import division
|
||||
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
import numpy
|
||||
import plac
|
||||
|
||||
import spacy.en
|
||||
|
||||
|
||||
def read_data(nlp, data_dir):
|
||||
for subdir, label in (('pos', 1), ('neg', 0)):
|
||||
for filename in (data_dir / subdir).iterdir():
|
||||
text = filename.open().read()
|
||||
doc = nlp(text)
|
||||
if len(doc) >= 1:
|
||||
yield doc, label
|
||||
|
||||
|
||||
def partition(examples, split_size):
|
||||
examples = list(examples)
|
||||
numpy.random.shuffle(examples)
|
||||
n_docs = len(examples)
|
||||
split = int(n_docs * split_size)
|
||||
return examples[:split], examples[split:]
|
||||
|
||||
|
||||
def minibatch(data, bs=24):
|
||||
for i in range(0, len(data), bs):
|
||||
yield data[i:i+bs]
|
||||
|
||||
|
||||
class Extractor(object):
|
||||
def __init__(self, nlp, vector_length, dropout=0.3):
|
||||
self.nlp = nlp
|
||||
self.dropout = dropout
|
||||
self.vector = numpy.zeros((vector_length, ))
|
||||
|
||||
def doc2bow(self, doc, dropout=None):
|
||||
if dropout is None:
|
||||
dropout = self.dropout
|
||||
bow = defaultdict(int)
|
||||
all_words = defaultdict(int)
|
||||
for word in doc:
|
||||
if numpy.random.random() >= dropout and not word.is_punct:
|
||||
bow[word.lower] += 1
|
||||
all_words[word.lower] += 1
|
||||
if sum(bow.values()) >= 1:
|
||||
return bow
|
||||
else:
|
||||
return all_words
|
||||
|
||||
def bow2vec(self, bow, E):
|
||||
self.vector.fill(0)
|
||||
n = 0
|
||||
for orth_id, freq in bow.items():
|
||||
self.vector += self.nlp.vocab[self.nlp.vocab.strings[orth_id]].vector * freq
|
||||
# Apply the fine-tuning we've learned
|
||||
if orth_id < E.shape[0]:
|
||||
self.vector += E[orth_id] * freq
|
||||
n += freq
|
||||
return self.vector / n
|
||||
|
||||
|
||||
class NeuralNetwork(object):
|
||||
def __init__(self, depth, width, n_classes, n_vocab, extracter, optimizer):
|
||||
self.depth = depth
|
||||
self.width = width
|
||||
self.n_classes = n_classes
|
||||
self.weights = Params.random(depth, width, width, n_classes, n_vocab)
|
||||
self.doc2bow = extracter.doc2bow
|
||||
self.bow2vec = extracter.bow2vec
|
||||
self.optimizer = optimizer
|
||||
self._gradient = Params.zero(depth, width, width, n_classes, n_vocab)
|
||||
self._activity = numpy.zeros((depth, width))
|
||||
|
||||
def train(self, batch):
|
||||
activity = self._activity
|
||||
gradient = self._gradient
|
||||
activity.fill(0)
|
||||
gradient.data.fill(0)
|
||||
loss = 0
|
||||
word_freqs = defaultdict(int)
|
||||
for doc, label in batch:
|
||||
word_ids = self.doc2bow(doc)
|
||||
vector = self.bow2vec(word_ids, self.weights.E)
|
||||
self.forward(activity, vector)
|
||||
loss += self.backprop(vector, gradient, activity, word_ids, label)
|
||||
for w, freq in word_ids.items():
|
||||
word_freqs[w] += freq
|
||||
self.optimizer(self.weights, gradient, len(batch), word_freqs)
|
||||
return loss
|
||||
|
||||
def predict(self, doc):
|
||||
actv = self._activity
|
||||
actv.fill(0)
|
||||
W = self.weights.W
|
||||
b = self.weights.b
|
||||
E = self.weights.E
|
||||
|
||||
vector = self.bow2vec(self.doc2bow(doc, dropout=0.0), E)
|
||||
self.forward(actv, vector)
|
||||
return numpy.argmax(softmax(actv[-1], W[-1], b[-1]))
|
||||
|
||||
def forward(self, actv, in_):
|
||||
actv.fill(0)
|
||||
W = self.weights.W; b = self.weights.b
|
||||
actv[0] = relu(in_, W[0], b[0])
|
||||
for i in range(1, self.depth):
|
||||
actv[i] = relu(actv[i-1], W[i], b[i])
|
||||
|
||||
def backprop(self, input_vector, gradient, activity, ids, label):
|
||||
W = self.weights.W
|
||||
b = self.weights.b
|
||||
|
||||
target = numpy.zeros(self.n_classes)
|
||||
target[label] = 1.0
|
||||
pred = softmax(activity[-1], W[-1], b[-1])
|
||||
delta = pred - target
|
||||
|
||||
for i in range(self.depth, 0, -1):
|
||||
gradient.b[i] += delta
|
||||
gradient.W[i] += numpy.outer(delta, activity[i-1])
|
||||
delta = d_relu(activity[i-1]) * W[i].T.dot(delta)
|
||||
|
||||
gradient.b[0] += delta
|
||||
gradient.W[0] += numpy.outer(delta, input_vector)
|
||||
tuning = W[0].T.dot(delta).reshape((self.width,)) / len(ids)
|
||||
for w, freq in ids.items():
|
||||
if w < gradient.E.shape[0]:
|
||||
gradient.E[w] += tuning * freq
|
||||
return -sum(target * numpy.log(pred))
|
||||
|
||||
|
||||
def softmax(actvn, W, b):
|
||||
w = W.dot(actvn) + b
|
||||
ew = numpy.exp(w - max(w))
|
||||
return (ew / sum(ew)).ravel()
|
||||
|
||||
|
||||
def relu(actvn, W, b):
|
||||
x = W.dot(actvn) + b
|
||||
return x * (x > 0)
|
||||
|
||||
|
||||
def d_relu(x):
|
||||
return x > 0
|
||||
|
||||
|
||||
class Adagrad(object):
|
||||
def __init__(self, lr, rho):
|
||||
self.eps = 1e-3
|
||||
# initial learning rate
|
||||
self.learning_rate = lr
|
||||
self.rho = rho
|
||||
# stores sum of squared gradients
|
||||
#self.h = numpy.zeros(self.dim)
|
||||
#self._curr_rate = numpy.zeros(self.h.shape)
|
||||
self.h = None
|
||||
self._curr_rate = None
|
||||
|
||||
def __call__(self, weights, gradient, batch_size, word_freqs):
|
||||
if self.h is None:
|
||||
self.h = numpy.zeros(gradient.data.shape)
|
||||
self._curr_rate = numpy.zeros(gradient.data.shape)
|
||||
self.L2_penalty(gradient, weights, word_freqs)
|
||||
update = self.rescale(gradient.data / batch_size)
|
||||
weights.data -= update
|
||||
|
||||
def rescale(self, gradient):
|
||||
if self.h is None:
|
||||
self.h = numpy.zeros(gradient.data.shape)
|
||||
self._curr_rate = numpy.zeros(gradient.data.shape)
|
||||
self._curr_rate.fill(0)
|
||||
self.h += gradient ** 2
|
||||
self._curr_rate = self.learning_rate / (numpy.sqrt(self.h) + self.eps)
|
||||
return self._curr_rate * gradient
|
||||
|
||||
def L2_penalty(self, gradient, weights, word_freqs):
|
||||
# L2 Regularization
|
||||
for i in range(len(weights.W)):
|
||||
gradient.W[i] += weights.W[i] * self.rho
|
||||
gradient.b[i] += weights.b[i] * self.rho
|
||||
for w, freq in word_freqs.items():
|
||||
if w < gradient.E.shape[0]:
|
||||
gradient.E[w] += weights.E[w] * self.rho
|
||||
|
||||
|
||||
class Params(object):
|
||||
@classmethod
|
||||
def zero(cls, depth, n_embed, n_hidden, n_labels, n_vocab):
|
||||
return cls(depth, n_embed, n_hidden, n_labels, n_vocab, lambda x: numpy.zeros((x,)))
|
||||
|
||||
@classmethod
|
||||
def random(cls, depth, nE, nH, nL, nV):
|
||||
return cls(depth, nE, nH, nL, nV, lambda x: (numpy.random.rand(x) * 2 - 1) * 0.08)
|
||||
|
||||
def __init__(self, depth, n_embed, n_hidden, n_labels, n_vocab, initializer):
|
||||
nE = n_embed; nH = n_hidden; nL = n_labels; nV = n_vocab
|
||||
n_weights = sum([
|
||||
(nE * nH) + nH,
|
||||
(nH * nH + nH) * depth,
|
||||
(nH * nL) + nL,
|
||||
(nV * nE)
|
||||
])
|
||||
self.data = initializer(n_weights)
|
||||
self.W = []
|
||||
self.b = []
|
||||
i = self._add_layer(0, nE, nH)
|
||||
for _ in range(1, depth):
|
||||
i = self._add_layer(i, nH, nH)
|
||||
i = self._add_layer(i, nL, nH)
|
||||
self.E = self.data[i : i + (nV * nE)].reshape((nV, nE))
|
||||
self.E.fill(0)
|
||||
|
||||
def _add_layer(self, start, x, y):
|
||||
end = start + (x * y)
|
||||
self.W.append(self.data[start : end].reshape((x, y)))
|
||||
self.b.append(self.data[end : end + x].reshape((x, )))
|
||||
return end + x
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
data_dir=("Data directory", "positional", None, Path),
|
||||
n_iter=("Number of iterations (epochs)", "option", "i", int),
|
||||
width=("Size of hidden layers", "option", "H", int),
|
||||
depth=("Depth", "option", "d", int),
|
||||
dropout=("Drop-out rate", "option", "r", float),
|
||||
rho=("Regularization penalty", "option", "p", float),
|
||||
eta=("Learning rate", "option", "e", float),
|
||||
batch_size=("Batch size", "option", "b", int),
|
||||
vocab_size=("Number of words to fine-tune", "option", "w", int),
|
||||
)
|
||||
def main(data_dir, depth=3, width=300, n_iter=5, vocab_size=40000,
|
||||
batch_size=24, dropout=0.3, rho=1e-5, eta=0.005):
|
||||
n_classes = 2
|
||||
print("Loading")
|
||||
nlp = spacy.en.English(parser=False)
|
||||
train_data, dev_data = partition(read_data(nlp, data_dir / 'train'), 0.8)
|
||||
print("Begin training")
|
||||
extracter = Extractor(nlp, width, dropout=0.3)
|
||||
optimizer = Adagrad(eta, rho)
|
||||
model = NeuralNetwork(depth, width, n_classes, vocab_size, extracter, optimizer)
|
||||
prev_best = 0
|
||||
best_weights = None
|
||||
for epoch in range(n_iter):
|
||||
numpy.random.shuffle(train_data)
|
||||
train_loss = 0.0
|
||||
for batch in minibatch(train_data, bs=batch_size):
|
||||
train_loss += model.train(batch)
|
||||
n_correct = sum(model.predict(x) == y for x, y in dev_data)
|
||||
print(epoch, train_loss, n_correct / len(dev_data))
|
||||
if n_correct >= prev_best:
|
||||
best_weights = model.weights.data.copy()
|
||||
prev_best = n_correct
|
||||
|
||||
model.weights.data = best_weights
|
||||
print("Evaluating")
|
||||
eval_data = list(read_data(nlp, data_dir / 'test'))
|
||||
n_correct = sum(model.predict(x) == y for x, y in eval_data)
|
||||
print(n_correct / len(eval_data))
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
#import cProfile
|
||||
#import pstats
|
||||
#cProfile.runctx("main(Path('data/aclImdb'))", globals(), locals(), "Profile.prof")
|
||||
#s = pstats.Stats("Profile.prof")
|
||||
#s.strip_dirs().sort_stats("time").print_stats(100)
|
||||
plac.call(main)
|
|
@ -20,72 +20,72 @@ The algorithm is O(n) at run-time for document of length n because we're only ev
|
|||
matching over the tag patterns. So no matter how many phrases we're looking for,
|
||||
our pattern set stays very small (exact size depends on the maximum length we're
|
||||
looking for, as the query language currently has no quantifiers)
|
||||
|
||||
The example expects a .bz2 file from the Reddit corpus, and a patterns file,
|
||||
formatted in jsonl as a sequence of entries like this:
|
||||
|
||||
{"text":"Anchorage"}
|
||||
{"text":"Angola"}
|
||||
{"text":"Ann Arbor"}
|
||||
{"text":"Annapolis"}
|
||||
{"text":"Appalachia"}
|
||||
{"text":"Argentina"}
|
||||
"""
|
||||
from __future__ import print_function, unicode_literals, division
|
||||
from ast import literal_eval
|
||||
from bz2 import BZ2File
|
||||
import time
|
||||
import math
|
||||
import codecs
|
||||
|
||||
import plac
|
||||
import ujson
|
||||
|
||||
from preshed.maps import PreshMap
|
||||
from preshed.counter import PreshCounter
|
||||
from spacy.strings import hash_string
|
||||
from spacy.en import English
|
||||
from spacy.matcher import PhraseMatcher
|
||||
import spacy
|
||||
|
||||
|
||||
def read_gazetteer(tokenizer, loc, n=-1):
|
||||
for i, line in enumerate(open(loc)):
|
||||
phrase = literal_eval('u' + line.strip())
|
||||
if ' (' in phrase and phrase.endswith(')'):
|
||||
phrase = phrase.split(' (', 1)[0]
|
||||
if i >= n:
|
||||
break
|
||||
phrase = tokenizer(phrase)
|
||||
if all((t.is_lower and t.prob >= -10) for t in phrase):
|
||||
continue
|
||||
data = ujson.loads(line.strip())
|
||||
phrase = tokenizer(data['text'])
|
||||
for w in phrase:
|
||||
_ = tokenizer.vocab[w.text]
|
||||
if len(phrase) >= 2:
|
||||
yield phrase
|
||||
|
||||
|
||||
def read_text(bz2_loc):
|
||||
def read_text(bz2_loc, n=10000):
|
||||
with BZ2File(bz2_loc) as file_:
|
||||
for line in file_:
|
||||
yield line.decode('utf8')
|
||||
for i, line in enumerate(file_):
|
||||
data = ujson.loads(line)
|
||||
yield data['body']
|
||||
if i >= n:
|
||||
break
|
||||
|
||||
|
||||
def get_matches(tokenizer, phrases, texts, max_length=6):
|
||||
matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length)
|
||||
print("Match")
|
||||
matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
|
||||
matcher.add('Phrase', None, *phrases)
|
||||
for text in texts:
|
||||
doc = tokenizer(text)
|
||||
for w in doc:
|
||||
_ = doc.vocab[w.text]
|
||||
matches = matcher(doc)
|
||||
for mwe in doc.ents:
|
||||
yield mwe
|
||||
for ent_id, start, end in matches:
|
||||
yield (ent_id, doc[start:end].text)
|
||||
|
||||
|
||||
def main(patterns_loc, text_loc, counts_loc, n=10000000):
|
||||
nlp = English(parser=False, tagger=False, entity=False)
|
||||
print("Make matcher")
|
||||
phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
|
||||
counts = PreshCounter()
|
||||
def main(patterns_loc, text_loc, n=10000):
|
||||
nlp = spacy.blank('en')
|
||||
nlp.vocab.lex_attr_getters = {}
|
||||
phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
|
||||
count = 0
|
||||
t1 = time.time()
|
||||
for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
|
||||
counts.inc(hash_string(mwe.text), 1)
|
||||
for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
|
||||
count += 1
|
||||
t2 = time.time()
|
||||
print("10m tokens in %d s" % (t2 - t1))
|
||||
|
||||
with codecs.open(counts_loc, 'w', 'utf8') as file_:
|
||||
for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
|
||||
text = phrase.string
|
||||
key = hash_string(text)
|
||||
count = counts[key]
|
||||
if count != 0:
|
||||
file_.write('%d\t%s\n' % (count, text))
|
||||
|
||||
print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if False:
|
52
examples/pipeline/custom_attr_methods.py
Normal file
52
examples/pipeline/custom_attr_methods.py
Normal file
|
@ -0,0 +1,52 @@
|
|||
# coding: utf-8
|
||||
"""This example contains several snippets of methods that can be set via custom
|
||||
Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
|
||||
they're "bound" to the object and are partially applied – i.e. the object
|
||||
they're called on is passed in as the first argument."""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Doc, Span
|
||||
from spacy import displacy
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def to_html(doc, output='/tmp', style='dep'):
|
||||
"""Doc method extension for saving the current state as a displaCy
|
||||
visualization.
|
||||
"""
|
||||
# generate filename from first six non-punct tokens
|
||||
file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
|
||||
output_path = Path(output) / file_name
|
||||
html = displacy.render(doc, style=style, page=True) # render markup
|
||||
output_path.open('w', encoding='utf-8').write(html) # save to file
|
||||
print('Saved HTML to {}'.format(output_path))
|
||||
|
||||
|
||||
Doc.set_extension('to_html', method=to_html)
|
||||
|
||||
nlp = English()
|
||||
doc = nlp(u"This is a sentence about Apple.")
|
||||
# add entity manually for demo purposes, to make it work without a model
|
||||
doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
|
||||
doc._.to_html(style='ent')
|
||||
|
||||
|
||||
def overlap_tokens(doc, other_doc):
|
||||
"""Get the tokens from the original Doc that are also in the comparison Doc.
|
||||
"""
|
||||
overlap = []
|
||||
other_tokens = [token.text for token in other_doc]
|
||||
for token in doc:
|
||||
if token.text in other_tokens:
|
||||
overlap.append(token)
|
||||
return overlap
|
||||
|
||||
|
||||
Doc.set_extension('overlap', method=overlap_tokens)
|
||||
|
||||
nlp = English()
|
||||
doc1 = nlp(u"Peach emoji is where it has always been.")
|
||||
doc2 = nlp(u"Peach is the superior emoji.")
|
||||
tokens = doc1._.overlap(doc2)
|
||||
print(tokens)
|
108
examples/pipeline/custom_component_countries_api.py
Normal file
108
examples/pipeline/custom_component_countries_api.py
Normal file
|
@ -0,0 +1,108 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import requests
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.matcher import PhraseMatcher
|
||||
from spacy.tokens import Doc, Span, Token
|
||||
|
||||
|
||||
class RESTCountriesComponent(object):
|
||||
"""Example of a spaCy v2.0 pipeline component that requests all countries
|
||||
via the REST Countries API, merges country names into one token, assigns
|
||||
entity labels and sets attributes on country tokens, e.g. the capital and
|
||||
lat/lng coordinates. Can be extended with more details from the API.
|
||||
|
||||
REST Countries API: https://restcountries.eu
|
||||
API License: Mozilla Public License MPL 2.0
|
||||
"""
|
||||
name = 'rest_countries' # component name, will show up in the pipeline
|
||||
|
||||
def __init__(self, nlp, label='GPE'):
|
||||
"""Initialise the pipeline component. The shared nlp instance is used
|
||||
to initialise the matcher with the shared vocab, get the label ID and
|
||||
generate Doc objects as phrase match patterns.
|
||||
"""
|
||||
# Make request once on initialisation and store the data
|
||||
r = requests.get('https://restcountries.eu/rest/v2/all')
|
||||
r.raise_for_status() # make sure requests raises an error if it fails
|
||||
countries = r.json()
|
||||
|
||||
# Convert API response to dict keyed by country name for easy lookup
|
||||
# This could also be extended using the alternative and foreign language
|
||||
# names provided by the API
|
||||
self.countries = {c['name']: c for c in countries}
|
||||
self.label = nlp.vocab.strings[label] # get entity label ID
|
||||
|
||||
# Set up the PhraseMatcher with Doc patterns for each country name
|
||||
patterns = [nlp(c) for c in self.countries.keys()]
|
||||
self.matcher = PhraseMatcher(nlp.vocab)
|
||||
self.matcher.add('COUNTRIES', None, *patterns)
|
||||
|
||||
# Register attribute on the Token. We'll be overwriting this based on
|
||||
# the matches, so we're only setting a default value, not a getter.
|
||||
# If no default value is set, it defaults to None.
|
||||
Token.set_extension('is_country', default=False)
|
||||
Token.set_extension('country_capital')
|
||||
Token.set_extension('country_latlng')
|
||||
Token.set_extension('country_flag')
|
||||
|
||||
# Register attributes on Doc and Span via a getter that checks if one of
|
||||
# the contained tokens is set to is_country == True.
|
||||
Doc.set_extension('has_country', getter=self.has_country)
|
||||
Span.set_extension('has_country', getter=self.has_country)
|
||||
|
||||
|
||||
def __call__(self, doc):
|
||||
"""Apply the pipeline component on a Doc object and modify it if matches
|
||||
are found. Return the Doc, so it can be processed by the next component
|
||||
in the pipeline, if available.
|
||||
"""
|
||||
matches = self.matcher(doc)
|
||||
spans = [] # keep the spans for later so we can merge them afterwards
|
||||
for _, start, end in matches:
|
||||
# Generate Span representing the entity & set label
|
||||
entity = Span(doc, start, end, label=self.label)
|
||||
spans.append(entity)
|
||||
# Set custom attribute on each token of the entity
|
||||
# Can be extended with other data returned by the API, like
|
||||
# currencies, country code, flag, calling code etc.
|
||||
for token in entity:
|
||||
token._.set('is_country', True)
|
||||
token._.set('country_capital', self.countries[entity.text]['capital'])
|
||||
token._.set('country_latlng', self.countries[entity.text]['latlng'])
|
||||
token._.set('country_flag', self.countries[entity.text]['flag'])
|
||||
# Overwrite doc.ents and add entity – be careful not to replace!
|
||||
doc.ents = list(doc.ents) + [entity]
|
||||
for span in spans:
|
||||
# Iterate over all spans and merge them into one token. This is done
|
||||
# after setting the entities – otherwise, it would cause mismatched
|
||||
# indices!
|
||||
span.merge()
|
||||
return doc # don't forget to return the Doc!
|
||||
|
||||
def has_country(self, tokens):
|
||||
"""Getter for Doc and Span attributes. Returns True if one of the tokens
|
||||
is a country. Since the getter is only called when we access the
|
||||
attribute, we can refer to the Token's 'is_country' attribute here,
|
||||
which is already set in the processing step."""
|
||||
return any([t._.get('is_country') for t in tokens])
|
||||
|
||||
|
||||
# For simplicity, we start off with only the blank English Language class and
|
||||
# no model or pre-defined pipeline loaded.
|
||||
|
||||
nlp = English()
|
||||
rest_countries = RESTCountriesComponent(nlp) # initialise component
|
||||
nlp.add_pipe(rest_countries) # add it to the pipeline
|
||||
|
||||
doc = nlp(u"Some text about Colombia and the Czech Republic")
|
||||
|
||||
print('Pipeline', nlp.pipe_names) # pipeline contains component name
|
||||
print('Doc has countries', doc._.has_country) # Doc contains countries
|
||||
for token in doc:
|
||||
if token._.is_country:
|
||||
print(token.text, token._.country_capital, token._.country_latlng,
|
||||
token._.country_flag) # country data
|
||||
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all countries are entities
|
85
examples/pipeline/custom_component_entities.py
Normal file
85
examples/pipeline/custom_component_entities.py
Normal file
|
@ -0,0 +1,85 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.matcher import PhraseMatcher
|
||||
from spacy.tokens import Doc, Span, Token
|
||||
|
||||
|
||||
class TechCompanyRecognizer(object):
|
||||
"""Example of a spaCy v2.0 pipeline component that sets entity annotations
|
||||
based on list of single or multiple-word company names. Companies are
|
||||
labelled as ORG and their spans are merged into one token. Additionally,
|
||||
._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token
|
||||
respectively."""
|
||||
name = 'tech_companies' # component name, will show up in the pipeline
|
||||
|
||||
def __init__(self, nlp, companies=tuple(), label='ORG'):
|
||||
"""Initialise the pipeline component. The shared nlp instance is used
|
||||
to initialise the matcher with the shared vocab, get the label ID and
|
||||
generate Doc objects as phrase match patterns.
|
||||
"""
|
||||
self.label = nlp.vocab.strings[label] # get entity label ID
|
||||
|
||||
# Set up the PhraseMatcher – it can now take Doc objects as patterns,
|
||||
# so even if the list of companies is long, it's very efficient
|
||||
patterns = [nlp(org) for org in companies]
|
||||
self.matcher = PhraseMatcher(nlp.vocab)
|
||||
self.matcher.add('TECH_ORGS', None, *patterns)
|
||||
|
||||
# Register attribute on the Token. We'll be overwriting this based on
|
||||
# the matches, so we're only setting a default value, not a getter.
|
||||
Token.set_extension('is_tech_org', default=False)
|
||||
|
||||
# Register attributes on Doc and Span via a getter that checks if one of
|
||||
# the contained tokens is set to is_tech_org == True.
|
||||
Doc.set_extension('has_tech_org', getter=self.has_tech_org)
|
||||
Span.set_extension('has_tech_org', getter=self.has_tech_org)
|
||||
|
||||
def __call__(self, doc):
|
||||
"""Apply the pipeline component on a Doc object and modify it if matches
|
||||
are found. Return the Doc, so it can be processed by the next component
|
||||
in the pipeline, if available.
|
||||
"""
|
||||
matches = self.matcher(doc)
|
||||
spans = [] # keep the spans for later so we can merge them afterwards
|
||||
for _, start, end in matches:
|
||||
# Generate Span representing the entity & set label
|
||||
entity = Span(doc, start, end, label=self.label)
|
||||
spans.append(entity)
|
||||
# Set custom attribute on each token of the entity
|
||||
for token in entity:
|
||||
token._.set('is_tech_org', True)
|
||||
# Overwrite doc.ents and add entity – be careful not to replace!
|
||||
doc.ents = list(doc.ents) + [entity]
|
||||
for span in spans:
|
||||
# Iterate over all spans and merge them into one token. This is done
|
||||
# after setting the entities – otherwise, it would cause mismatched
|
||||
# indices!
|
||||
span.merge()
|
||||
return doc # don't forget to return the Doc!
|
||||
|
||||
def has_tech_org(self, tokens):
|
||||
"""Getter for Doc and Span attributes. Returns True if one of the tokens
|
||||
is a tech org. Since the getter is only called when we access the
|
||||
attribute, we can refer to the Token's 'is_tech_org' attribute here,
|
||||
which is already set in the processing step."""
|
||||
return any([t._.get('is_tech_org') for t in tokens])
|
||||
|
||||
|
||||
# For simplicity, we start off with only the blank English Language class and
|
||||
# no model or pre-defined pipeline loaded.
|
||||
|
||||
nlp = English()
|
||||
companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc.
|
||||
component = TechCompanyRecognizer(nlp, companies) # initialise component
|
||||
nlp.add_pipe(component, last=True) # add it to the pipeline as the last element
|
||||
|
||||
doc = nlp(u"Alphabet Inc. is the company behind Google.")
|
||||
|
||||
print('Pipeline', nlp.pipe_names) # pipeline contains component name
|
||||
print('Tokens', [t.text for t in doc]) # company names from the list are merged
|
||||
print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs
|
||||
print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org
|
||||
print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not
|
||||
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities
|
|
@ -6,31 +6,36 @@ To achieve that, it duplicates some of spaCy's internal functionality.
|
|||
|
||||
Specifically, in this example, we don't use spaCy's built-in Language class to
|
||||
wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write
|
||||
our own simle Pipeline class, so that it's easier to see how the pieces
|
||||
our own simple Pipeline class, so that it's easier to see how the pieces
|
||||
interact.
|
||||
|
||||
Input data:
|
||||
https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip
|
||||
|
||||
Developed for: spaCy 1.7.1
|
||||
Last tested for: spaCy 1.7.1
|
||||
Last tested for: spaCy 2.0.0a13
|
||||
'''
|
||||
from __future__ import unicode_literals, print_function
|
||||
import plac
|
||||
from pathlib import Path
|
||||
import random
|
||||
import json
|
||||
import tqdm
|
||||
|
||||
from thinc.neural.optimizers import Adam
|
||||
from thinc.neural.ops import NumpyOps
|
||||
|
||||
import spacy.orth as orth_funcs
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.pipeline import BeamEntityRecognizer
|
||||
from spacy.pipeline import EntityRecognizer
|
||||
from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
|
||||
from spacy.tokenizer import Tokenizer
|
||||
from spacy.tokens import Doc
|
||||
from spacy.attrs import *
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.gold import _iob_to_biluo as iob_to_biluo
|
||||
from spacy.gold import iob_to_biluo
|
||||
from spacy.gold import minibatch
|
||||
from spacy.scorer import Scorer
|
||||
import spacy.util
|
||||
|
||||
|
||||
try:
|
||||
unicode
|
||||
|
@ -38,96 +43,38 @@ except NameError:
|
|||
unicode = str
|
||||
|
||||
|
||||
spacy.util.set_env_log(True)
|
||||
|
||||
|
||||
def init_vocab():
|
||||
return Vocab(
|
||||
lex_attr_getters={
|
||||
LOWER: lambda string: string.lower(),
|
||||
SHAPE: orth_funcs.word_shape,
|
||||
NORM: lambda string: string.lower(),
|
||||
PREFIX: lambda string: string[0],
|
||||
SUFFIX: lambda string: string[-3:],
|
||||
CLUSTER: lambda string: 0,
|
||||
IS_ALPHA: orth_funcs.is_alpha,
|
||||
IS_ASCII: orth_funcs.is_ascii,
|
||||
IS_DIGIT: lambda string: string.isdigit(),
|
||||
IS_LOWER: orth_funcs.is_lower,
|
||||
IS_PUNCT: orth_funcs.is_punct,
|
||||
IS_SPACE: lambda string: string.isspace(),
|
||||
IS_TITLE: orth_funcs.is_title,
|
||||
IS_UPPER: orth_funcs.is_upper,
|
||||
IS_STOP: lambda string: False,
|
||||
IS_OOV: lambda string: True
|
||||
})
|
||||
|
||||
|
||||
def save_vocab(vocab, path):
|
||||
path = Path(path)
|
||||
if not path.exists():
|
||||
path.mkdir()
|
||||
elif not path.is_dir():
|
||||
raise IOError("Can't save vocab to %s\nNot a directory" % path)
|
||||
with (path / 'strings.json').open('w') as file_:
|
||||
vocab.strings.dump(file_)
|
||||
vocab.dump((path / 'lexemes.bin').as_posix())
|
||||
|
||||
|
||||
def load_vocab(path):
|
||||
path = Path(path)
|
||||
if not path.exists():
|
||||
raise IOError("Cannot load vocab from %s\nDoes not exist" % path)
|
||||
if not path.is_dir():
|
||||
raise IOError("Cannot load vocab from %s\nNot a directory" % path)
|
||||
return Vocab.load(path)
|
||||
|
||||
|
||||
def init_ner_model(vocab, features=None):
|
||||
if features is None:
|
||||
features = tuple(EntityRecognizer.feature_templates)
|
||||
return EntityRecognizer(vocab, features=features)
|
||||
|
||||
|
||||
def save_ner_model(model, path):
|
||||
path = Path(path)
|
||||
if not path.exists():
|
||||
path.mkdir()
|
||||
if not path.is_dir():
|
||||
raise IOError("Can't save model to %s\nNot a directory" % path)
|
||||
model.model.dump((path / 'model').as_posix())
|
||||
with (path / 'config.json').open('w') as file_:
|
||||
data = json.dumps(model.cfg)
|
||||
if not isinstance(data, unicode):
|
||||
data = data.decode('utf8')
|
||||
file_.write(data)
|
||||
|
||||
|
||||
def load_ner_model(vocab, path):
|
||||
return EntityRecognizer.load(path, vocab)
|
||||
|
||||
|
||||
class Pipeline(object):
|
||||
@classmethod
|
||||
def load(cls, path):
|
||||
path = Path(path)
|
||||
if not path.exists():
|
||||
raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
|
||||
if not path.is_dir():
|
||||
raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
|
||||
vocab = load_vocab(path)
|
||||
tokenizer = Tokenizer(vocab, {}, None, None, None)
|
||||
ner_model = load_ner_model(vocab, path / 'ner')
|
||||
return cls(vocab, tokenizer, ner_model)
|
||||
|
||||
def __init__(self, vocab=None, tokenizer=None, entity=None):
|
||||
if vocab is None:
|
||||
vocab = init_vocab()
|
||||
if tokenizer is None:
|
||||
tokenizer = Tokenizer(vocab, {}, None, None, None)
|
||||
if entity is None:
|
||||
entity = init_ner_model(self.vocab)
|
||||
entity = NeuralEntityRecognizer(vocab)
|
||||
self.vocab = vocab
|
||||
self.tokenizer = tokenizer
|
||||
self.entity = entity
|
||||
self.pipeline = [self.entity]
|
||||
|
||||
def begin_training(self):
|
||||
for model in self.pipeline:
|
||||
model.begin_training([])
|
||||
optimizer = Adam(NumpyOps(), 0.001)
|
||||
return optimizer
|
||||
|
||||
def __call__(self, input_):
|
||||
doc = self.make_doc(input_)
|
||||
for process in self.pipeline:
|
||||
|
@ -147,14 +94,16 @@ class Pipeline(object):
|
|||
gold = GoldParse(doc, entities=annotations)
|
||||
return gold
|
||||
|
||||
def update(self, input_, annot):
|
||||
doc = self.make_doc(input_)
|
||||
gold = self.make_gold(input_, annot)
|
||||
for ner in gold.ner:
|
||||
if ner not in (None, '-', 'O'):
|
||||
action, label = ner.split('-', 1)
|
||||
self.entity.add_label(label)
|
||||
return self.entity.update(doc, gold)
|
||||
def update(self, inputs, annots, sgd, losses=None, drop=0.):
|
||||
if losses is None:
|
||||
losses = {}
|
||||
docs = [self.make_doc(input_) for input_ in inputs]
|
||||
golds = [self.make_gold(input_, annot) for input_, annot in
|
||||
zip(inputs, annots)]
|
||||
|
||||
self.entity.update(docs, golds, drop=drop,
|
||||
sgd=sgd, losses=losses)
|
||||
return losses
|
||||
|
||||
def evaluate(self, examples):
|
||||
scorer = Scorer()
|
||||
|
@ -164,43 +113,44 @@ class Pipeline(object):
|
|||
scorer.score(doc, gold)
|
||||
return scorer.scores
|
||||
|
||||
def average_weights(self):
|
||||
self.entity.model.end_training()
|
||||
|
||||
def save(self, path):
|
||||
def to_disk(self, path):
|
||||
path = Path(path)
|
||||
if not path.exists():
|
||||
path.mkdir()
|
||||
elif not path.is_dir():
|
||||
raise IOError("Can't save pipeline to %s\nNot a directory" % path)
|
||||
save_vocab(self.vocab, path / 'vocab')
|
||||
save_ner_model(self.entity, path / 'ner')
|
||||
self.vocab.to_disk(path / 'vocab')
|
||||
self.entity.to_disk(path / 'ner')
|
||||
|
||||
def from_disk(self, path):
|
||||
path = Path(path)
|
||||
if not path.exists():
|
||||
raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
|
||||
if not path.is_dir():
|
||||
raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
|
||||
self.vocab = self.vocab.from_disk(path / 'vocab')
|
||||
self.entity = self.entity.from_disk(path / 'ner')
|
||||
|
||||
|
||||
def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
|
||||
next_epoch = train_examples
|
||||
def train(nlp, train_examples, dev_examples, nr_epoch=5):
|
||||
sgd = nlp.begin_training()
|
||||
print("Iter", "Loss", "P", "R", "F")
|
||||
for i in range(nr_epoch):
|
||||
this_epoch = next_epoch
|
||||
next_epoch = []
|
||||
loss = 0
|
||||
for input_, annot in this_epoch:
|
||||
loss += nlp.update(input_, annot)
|
||||
if (i+1) < nr_epoch:
|
||||
next_epoch.append((input_, annot))
|
||||
random.shuffle(next_epoch)
|
||||
random.shuffle(train_examples)
|
||||
losses = {}
|
||||
for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8):
|
||||
inputs, annots = zip(*batch)
|
||||
nlp.update(list(inputs), list(annots), sgd, losses=losses)
|
||||
scores = nlp.evaluate(dev_examples)
|
||||
report_scores(i, loss, scores)
|
||||
nlp.average_weights()
|
||||
scores = nlp.evaluate(dev_examples)
|
||||
report_scores(channels, i+1, loss, scores)
|
||||
report_scores(i+1, losses['ner'], scores)
|
||||
|
||||
|
||||
def report_scores(i, loss, scores):
|
||||
precision = '%.2f' % scores['ents_p']
|
||||
recall = '%.2f' % scores['ents_r']
|
||||
f_measure = '%.2f' % scores['ents_f']
|
||||
print('%d %s %s %s' % (int(loss), precision, recall, f_measure))
|
||||
print('Epoch %d: %d %s %s %s' % (
|
||||
i, int(loss), precision, recall, f_measure))
|
||||
|
||||
|
||||
def read_examples(path):
|
||||
|
@ -208,7 +158,8 @@ def read_examples(path):
|
|||
with path.open() as file_:
|
||||
sents = file_.read().strip().split('\n\n')
|
||||
for sent in sents:
|
||||
if not sent.strip():
|
||||
sent = sent.strip()
|
||||
if not sent:
|
||||
continue
|
||||
tokens = sent.split('\n')
|
||||
while tokens and tokens[0].startswith('#'):
|
||||
|
@ -217,28 +168,39 @@ def read_examples(path):
|
|||
iob = []
|
||||
for token in tokens:
|
||||
if token.strip():
|
||||
pieces = token.split()
|
||||
pieces = token.split('\t')
|
||||
words.append(pieces[1])
|
||||
iob.append(pieces[2])
|
||||
yield words, iob_to_biluo(iob)
|
||||
|
||||
|
||||
def get_labels(examples):
|
||||
labels = set()
|
||||
for words, tags in examples:
|
||||
for tag in tags:
|
||||
if '-' in tag:
|
||||
labels.add(tag.split('-')[1])
|
||||
return sorted(labels)
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model_dir=("Path to save the model", "positional", None, Path),
|
||||
train_loc=("Path to your training data", "positional", None, Path),
|
||||
dev_loc=("Path to your development data", "positional", None, Path),
|
||||
)
|
||||
def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
|
||||
train_loc=None, dev_loc=None, nr_epoch=30):
|
||||
|
||||
train_examples = read_examples(train_loc)
|
||||
def main(model_dir, train_loc, dev_loc, nr_epoch=30):
|
||||
print(model_dir, train_loc, dev_loc)
|
||||
train_examples = list(read_examples(train_loc))
|
||||
dev_examples = read_examples(dev_loc)
|
||||
nlp = Pipeline.load(model_dir)
|
||||
nlp = Pipeline()
|
||||
for label in get_labels(train_examples):
|
||||
nlp.entity.add_label(label)
|
||||
print("Add label", label)
|
||||
|
||||
train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)
|
||||
train(nlp, train_examples, list(dev_examples), nr_epoch)
|
||||
|
||||
nlp.save(model_dir)
|
||||
nlp.to_disk(model_dir)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
plac.call(main)
|
||||
|
|
|
@ -25,7 +25,7 @@ For more details, see the documentation:
|
|||
* Saving and loading models: https://spacy.io/docs/usage/saving-loading
|
||||
|
||||
Developed for: spaCy 1.7.6
|
||||
Last tested for: spaCy 1.7.6
|
||||
Last updated for: spaCy 2.0.0a13
|
||||
"""
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
|
@ -34,55 +34,40 @@ from pathlib import Path
|
|||
import random
|
||||
|
||||
import spacy
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.tagger import Tagger
|
||||
from spacy.gold import GoldParse, minibatch
|
||||
from spacy.pipeline import NeuralEntityRecognizer
|
||||
from spacy.pipeline import TokenVectorEncoder
|
||||
|
||||
|
||||
def get_gold_parses(tokenizer, train_data):
|
||||
'''Shuffle and create GoldParse objects'''
|
||||
random.shuffle(train_data)
|
||||
for raw_text, entity_offsets in train_data:
|
||||
doc = tokenizer(raw_text)
|
||||
gold = GoldParse(doc, entities=entity_offsets)
|
||||
yield doc, gold
|
||||
|
||||
|
||||
def train_ner(nlp, train_data, output_dir):
|
||||
# Add new words to vocab
|
||||
for raw_text, _ in train_data:
|
||||
doc = nlp.make_doc(raw_text)
|
||||
for word in doc:
|
||||
_ = nlp.vocab[word.orth]
|
||||
random.seed(0)
|
||||
# You may need to change the learning rate. It's generally difficult to
|
||||
# guess what rate you should set, especially when you have limited data.
|
||||
nlp.entity.model.learn_rate = 0.001
|
||||
for itn in range(1000):
|
||||
random.shuffle(train_data)
|
||||
loss = 0.
|
||||
for raw_text, entity_offsets in train_data:
|
||||
gold = GoldParse(doc, entities=entity_offsets)
|
||||
# By default, the GoldParse class assumes that the entities
|
||||
# described by offset are complete, and all other words should
|
||||
# have the tag 'O'. You can tell it to make no assumptions
|
||||
# about the tag of a word by giving it the tag '-'.
|
||||
# However, this allows a trivial solution to the current
|
||||
# learning problem: if words are either 'any tag' or 'ANIMAL',
|
||||
# the model can learn that all words can be tagged 'ANIMAL'.
|
||||
#for i in range(len(gold.ner)):
|
||||
#if not gold.ner[i].endswith('ANIMAL'):
|
||||
# gold.ner[i] = '-'
|
||||
doc = nlp.make_doc(raw_text)
|
||||
nlp.tagger(doc)
|
||||
# As of 1.9, spaCy's parser now lets you supply a dropout probability
|
||||
# This might help the model generalize better from only a few
|
||||
# examples.
|
||||
loss += nlp.entity.update(doc, gold, drop=0.9)
|
||||
if loss == 0:
|
||||
break
|
||||
# This step averages the model's weights. This may or may not be good for
|
||||
# your situation --- it's empirical.
|
||||
nlp.end_training()
|
||||
if output_dir:
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
nlp.save_to_directory(output_dir)
|
||||
optimizer = nlp.begin_training(lambda: [])
|
||||
nlp.meta['name'] = 'en_ent_animal'
|
||||
for itn in range(50):
|
||||
losses = {}
|
||||
for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3):
|
||||
docs, golds = zip(*batch)
|
||||
nlp.update(docs, golds, losses=losses, sgd=optimizer, drop=0.35)
|
||||
print(losses)
|
||||
if not output_dir:
|
||||
return
|
||||
elif not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
nlp.to_disk(output_dir)
|
||||
|
||||
|
||||
def main(model_name, output_directory=None):
|
||||
print("Loading initial model", model_name)
|
||||
nlp = spacy.load(model_name)
|
||||
print("Creating initial model", model_name)
|
||||
nlp = spacy.blank(model_name)
|
||||
if output_directory is not None:
|
||||
output_directory = Path(output_directory)
|
||||
|
||||
|
@ -91,6 +76,11 @@ def main(model_name, output_directory=None):
|
|||
"Horses are too tall and they pretend to care about your feelings",
|
||||
[(0, 6, 'ANIMAL')],
|
||||
),
|
||||
(
|
||||
"Do they bite?",
|
||||
[],
|
||||
),
|
||||
|
||||
(
|
||||
"horses are too tall and they pretend to care about your feelings",
|
||||
[(0, 6, 'ANIMAL')]
|
||||
|
@ -109,18 +99,21 @@ def main(model_name, output_directory=None):
|
|||
)
|
||||
|
||||
]
|
||||
nlp.entity.add_label('ANIMAL')
|
||||
nlp.add_pipe(TokenVectorEncoder(nlp.vocab))
|
||||
ner = NeuralEntityRecognizer(nlp.vocab)
|
||||
ner.add_label('ANIMAL')
|
||||
nlp.add_pipe(ner)
|
||||
train_ner(nlp, train_data, output_directory)
|
||||
|
||||
# Test that the entity is recognized
|
||||
doc = nlp('Do you like horses?')
|
||||
text = 'Do you like horses?'
|
||||
print("Ents in 'Do you like horses?':")
|
||||
doc = nlp(text)
|
||||
for ent in doc.ents:
|
||||
print(ent.label_, ent.text)
|
||||
if output_directory:
|
||||
print("Loading from", output_directory)
|
||||
nlp2 = spacy.load('en', path=output_directory)
|
||||
nlp2.entity.add_label('ANIMAL')
|
||||
nlp2 = spacy.load(output_directory)
|
||||
doc2 = nlp2('Do you like horses?')
|
||||
for ent in doc2.ents:
|
||||
print(ent.label_, ent.text)
|
||||
|
|
121
examples/training/train_textcat.py
Normal file
121
examples/training/train_textcat.py
Normal file
|
@ -0,0 +1,121 @@
|
|||
'''Train a multi-label convolutional neural network text classifier,
|
||||
using the spacy.pipeline.TextCategorizer component. The model is then added
|
||||
to spacy.pipeline, and predictions are available at `doc.cats`.
|
||||
'''
|
||||
from __future__ import unicode_literals
|
||||
import plac
|
||||
import random
|
||||
import tqdm
|
||||
|
||||
from thinc.neural.optimizers import Adam
|
||||
from thinc.neural.ops import NumpyOps
|
||||
import thinc.extra.datasets
|
||||
|
||||
import spacy.lang.en
|
||||
from spacy.gold import GoldParse, minibatch
|
||||
from spacy.util import compounding
|
||||
from spacy.pipeline import TextCategorizer
|
||||
|
||||
# TODO: Remove this once we're not supporting models trained with thinc <6.9.0
|
||||
import thinc.neural._classes.layernorm
|
||||
thinc.neural._classes.layernorm.set_compat_six_eight(False)
|
||||
|
||||
|
||||
def train_textcat(tokenizer, textcat,
|
||||
train_texts, train_cats, dev_texts, dev_cats,
|
||||
n_iter=20):
|
||||
'''
|
||||
Train the TextCategorizer without associated pipeline.
|
||||
'''
|
||||
textcat.begin_training()
|
||||
optimizer = Adam(NumpyOps(), 0.001)
|
||||
train_docs = [tokenizer(text) for text in train_texts]
|
||||
train_gold = [GoldParse(doc, cats=cats) for doc, cats in
|
||||
zip(train_docs, train_cats)]
|
||||
train_data = list(zip(train_docs, train_gold))
|
||||
batch_sizes = compounding(4., 128., 1.001)
|
||||
for i in range(n_iter):
|
||||
losses = {}
|
||||
# Progress bar and minibatching
|
||||
batches = minibatch(tqdm.tqdm(train_data, leave=False), size=batch_sizes)
|
||||
for batch in batches:
|
||||
docs, golds = zip(*batch)
|
||||
textcat.update(docs, golds, sgd=optimizer, drop=0.2,
|
||||
losses=losses)
|
||||
with textcat.model.use_params(optimizer.averages):
|
||||
scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
|
||||
yield losses['textcat'], scores
|
||||
|
||||
|
||||
def evaluate(tokenizer, textcat, texts, cats):
|
||||
docs = (tokenizer(text) for text in texts)
|
||||
tp = 1e-8 # True positives
|
||||
fp = 1e-8 # False positives
|
||||
fn = 1e-8 # False negatives
|
||||
tn = 1e-8 # True negatives
|
||||
for i, doc in enumerate(textcat.pipe(docs)):
|
||||
gold = cats[i]
|
||||
for label, score in doc.cats.items():
|
||||
if label not in gold:
|
||||
continue
|
||||
if score >= 0.5 and gold[label] >= 0.5:
|
||||
tp += 1.
|
||||
elif score >= 0.5 and gold[label] < 0.5:
|
||||
fp += 1.
|
||||
elif score < 0.5 and gold[label] < 0.5:
|
||||
tn += 1
|
||||
elif score < 0.5 and gold[label] >= 0.5:
|
||||
fn += 1
|
||||
precis = tp / (tp + fp)
|
||||
recall = tp / (tp + fn)
|
||||
fscore = 2 * (precis * recall) / (precis + recall)
|
||||
return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
|
||||
|
||||
|
||||
def load_data(limit=0):
|
||||
# Partition off part of the train data --- avoid running experiments
|
||||
# against test.
|
||||
train_data, _ = thinc.extra.datasets.imdb()
|
||||
|
||||
random.shuffle(train_data)
|
||||
train_data = train_data[-limit:]
|
||||
|
||||
texts, labels = zip(*train_data)
|
||||
cats = [{'POSITIVE': bool(y)} for y in labels]
|
||||
|
||||
split = int(len(train_data) * 0.8)
|
||||
|
||||
train_texts = texts[:split]
|
||||
train_cats = cats[:split]
|
||||
dev_texts = texts[split:]
|
||||
dev_cats = cats[split:]
|
||||
return (train_texts, train_cats), (dev_texts, dev_cats)
|
||||
|
||||
|
||||
def main(model_loc=None):
|
||||
nlp = spacy.lang.en.English()
|
||||
tokenizer = nlp.tokenizer
|
||||
textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
|
||||
|
||||
print("Load IMDB data")
|
||||
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)
|
||||
|
||||
print("Itn.\tLoss\tP\tR\tF")
|
||||
progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
|
||||
|
||||
for i, (loss, scores) in enumerate(train_textcat(tokenizer, textcat,
|
||||
train_texts, train_cats,
|
||||
dev_texts, dev_cats, n_iter=20)):
|
||||
print(progress.format(i=i, loss=loss, **scores))
|
||||
# How to save, load and use
|
||||
nlp.pipeline.append(textcat)
|
||||
if model_loc is not None:
|
||||
nlp.to_disk(model_loc)
|
||||
|
||||
nlp = spacy.load(model_loc)
|
||||
doc = nlp(u'This movie sucked!')
|
||||
print(doc.cats)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
30
examples/vectors_fast_text.py
Normal file
30
examples/vectors_fast_text.py
Normal file
|
@ -0,0 +1,30 @@
|
|||
'''Load vectors for a language trained using FastText
|
||||
|
||||
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
|
||||
'''
|
||||
from __future__ import unicode_literals
|
||||
import plac
|
||||
import numpy
|
||||
|
||||
import spacy.language
|
||||
|
||||
|
||||
def main(vectors_loc):
|
||||
nlp = spacy.language.Language()
|
||||
|
||||
with open(vectors_loc, 'rb') as file_:
|
||||
header = file_.readline()
|
||||
nr_row, nr_dim = header.split()
|
||||
nlp.vocab.clear_vectors(int(nr_dim))
|
||||
for line in file_:
|
||||
line = line.decode('utf8')
|
||||
pieces = line.split()
|
||||
word = pieces[0]
|
||||
vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
|
||||
nlp.vocab.set_vector(word, vector)
|
||||
doc = nlp(u'class colspan')
|
||||
print(doc[0].similarity(doc[1]))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
5
fabfile.py
vendored
5
fabfile.py
vendored
|
@ -14,6 +14,7 @@ VENV_DIR = path.join(PWD, ENV)
|
|||
def env(lang='python2.7'):
|
||||
if path.exists(VENV_DIR):
|
||||
local('rm -rf {env}'.format(env=VENV_DIR))
|
||||
local('pip install virtualenv')
|
||||
local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
|
||||
|
||||
|
||||
|
@ -32,6 +33,10 @@ def make():
|
|||
local('pip install -r requirements.txt')
|
||||
local('python setup.py build_ext --inplace')
|
||||
|
||||
def sdist():
|
||||
with virtualenv(VENV_DIR):
|
||||
with lcd(path.dirname(__file__)):
|
||||
local('python setup.py sdist')
|
||||
|
||||
def clean():
|
||||
with lcd(path.dirname(__file__)):
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
cython<0.24
|
||||
cython>=0.24,<0.27.0
|
||||
pathlib
|
||||
numpy>=1.7
|
||||
cymem>=1.30,<1.32
|
||||
preshed>=1.0.0,<2.0.0
|
||||
thinc>=6.7.3,<6.8.0
|
||||
thinc>=6.9.0,<6.10.0
|
||||
murmurhash>=0.28,<0.29
|
||||
plac<1.0.0,>=0.9.6
|
||||
six
|
||||
|
@ -13,7 +13,7 @@ requests>=2.13.0,<3.0.0
|
|||
regex==2017.4.5
|
||||
ftfy>=4.4.2,<5.0.0
|
||||
pytest>=3.0.6,<4.0.0
|
||||
pip>=9.0.0,<10.0.0
|
||||
mock>=2.0.0,<3.0.0
|
||||
msgpack-python
|
||||
msgpack-numpy
|
||||
html5lib==1.0b8
|
||||
|
|
9
setup.py
9
setup.py
|
@ -28,7 +28,9 @@ MOD_NAMES = [
|
|||
'spacy.pipeline',
|
||||
'spacy.syntax.stateclass',
|
||||
'spacy.syntax._state',
|
||||
'spacy.syntax._beam_utils',
|
||||
'spacy.tokenizer',
|
||||
'spacy._cfile',
|
||||
'spacy.syntax.parser',
|
||||
'spacy.syntax.nn_parser',
|
||||
'spacy.syntax.beam_parser',
|
||||
|
@ -51,7 +53,8 @@ MOD_NAMES = [
|
|||
COMPILE_OPTIONS = {
|
||||
'msvc': ['/Ox', '/EHsc'],
|
||||
'mingw32' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'],
|
||||
'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']
|
||||
'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function',
|
||||
'-march=native']
|
||||
}
|
||||
|
||||
|
||||
|
@ -187,14 +190,14 @@ def setup_package():
|
|||
url=about['__uri__'],
|
||||
license=about['__license__'],
|
||||
ext_modules=ext_modules,
|
||||
scripts=['bin/spacy'],
|
||||
install_requires=[
|
||||
'numpy>=1.7',
|
||||
'murmurhash>=0.28,<0.29',
|
||||
'cymem>=1.30,<1.32',
|
||||
'preshed>=1.0.0,<2.0.0',
|
||||
'thinc>=6.7.3,<6.8.0',
|
||||
'thinc>=6.9.0,<6.10.0',
|
||||
'plac<1.0.0,>=0.9.6',
|
||||
'pip>=9.0.0,<10.0.0',
|
||||
'six',
|
||||
'pathlib',
|
||||
'ujson>=1.35',
|
||||
|
|
|
@ -4,14 +4,21 @@ from __future__ import unicode_literals
|
|||
from .cli.info import info as cli_info
|
||||
from .glossary import explain
|
||||
from .deprecated import resolve_load_name
|
||||
#from .about import __version__
|
||||
from .about import __version__
|
||||
from . import util
|
||||
|
||||
|
||||
def load(name, **overrides):
|
||||
from .deprecated import resolve_load_name
|
||||
name = resolve_load_name(name, **overrides)
|
||||
return util.load_model(name, **overrides)
|
||||
|
||||
|
||||
def blank(name, **kwargs):
|
||||
LangClass = util.get_lang_class(name)
|
||||
return LangClass(**kwargs)
|
||||
|
||||
|
||||
def info(model=None, markdown=False):
|
||||
return cli_info(None, model, markdown)
|
||||
|
|
|
@ -3,15 +3,25 @@ from __future__ import print_function
|
|||
# NB! This breaks in plac on Python 2!!
|
||||
#from __future__ import unicode_literals
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import plac
|
||||
import sys
|
||||
from spacy.cli import download, link, info, package, train, convert
|
||||
from spacy.cli import download, link, info, package, train, convert, model
|
||||
from spacy.cli import profile, evaluate, validate
|
||||
from spacy.util import prints
|
||||
|
||||
commands = {'download': download, 'link': link, 'info': info, 'train': train,
|
||||
'convert': convert, 'package': package}
|
||||
commands = {
|
||||
'download': download,
|
||||
'link': link,
|
||||
'info': info,
|
||||
'train': train,
|
||||
'evaluate': evaluate,
|
||||
'convert': convert,
|
||||
'package': package,
|
||||
'model': model,
|
||||
'profile': profile,
|
||||
'validate': validate
|
||||
}
|
||||
if len(sys.argv) == 1:
|
||||
prints(', '.join(commands), title="Available commands", exits=1)
|
||||
command = sys.argv.pop(1)
|
||||
|
@ -19,5 +29,7 @@ if __name__ == '__main__':
|
|||
if command in commands:
|
||||
plac.call(commands[command])
|
||||
else:
|
||||
prints("Available: %s" % ', '.join(commands),
|
||||
title="Unknown command: %s" % command, exits=1)
|
||||
prints(
|
||||
"Available: %s" % ', '.join(commands),
|
||||
title="Unknown command: %s" % command,
|
||||
exits=1)
|
||||
|
|
26
spacy/_cfile.pxd
Normal file
26
spacy/_cfile.pxd
Normal file
|
@ -0,0 +1,26 @@
|
|||
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
||||
from cymem.cymem cimport Pool
|
||||
|
||||
cdef class CFile:
|
||||
cdef FILE* fp
|
||||
cdef bint is_open
|
||||
cdef Pool mem
|
||||
cdef int size # For compatibility with subclass
|
||||
cdef int _capacity # For compatibility with subclass
|
||||
|
||||
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
|
||||
|
||||
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
|
||||
|
||||
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
|
||||
|
||||
|
||||
|
||||
cdef class StringCFile(CFile):
|
||||
cdef unsigned char* data
|
||||
|
||||
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
|
||||
|
||||
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
|
||||
|
||||
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
|
88
spacy/_cfile.pyx
Normal file
88
spacy/_cfile.pyx
Normal file
|
@ -0,0 +1,88 @@
|
|||
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
||||
from libc.string cimport memcpy
|
||||
|
||||
|
||||
cdef class CFile:
|
||||
def __init__(self, loc, mode, on_open_error=None):
|
||||
if isinstance(mode, unicode):
|
||||
mode_str = mode.encode('ascii')
|
||||
else:
|
||||
mode_str = mode
|
||||
if hasattr(loc, 'as_posix'):
|
||||
loc = loc.as_posix()
|
||||
self.mem = Pool()
|
||||
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
||||
self.fp = fopen(<char*>bytes_loc, mode_str)
|
||||
if self.fp == NULL:
|
||||
if on_open_error is not None:
|
||||
on_open_error()
|
||||
else:
|
||||
raise IOError("Could not open binary file %s" % bytes_loc)
|
||||
self.is_open = True
|
||||
|
||||
def __dealloc__(self):
|
||||
if self.is_open:
|
||||
fclose(self.fp)
|
||||
|
||||
def close(self):
|
||||
fclose(self.fp)
|
||||
self.is_open = False
|
||||
|
||||
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
|
||||
st = fread(dest, elem_size, number, self.fp)
|
||||
if st != number:
|
||||
raise IOError
|
||||
|
||||
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
|
||||
st = fwrite(src, elem_size, number, self.fp)
|
||||
if st != number:
|
||||
raise IOError
|
||||
|
||||
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
|
||||
cdef void* dest = mem.alloc(number, elem_size)
|
||||
self.read_into(dest, number, elem_size)
|
||||
return dest
|
||||
|
||||
def write_unicode(self, unicode value):
|
||||
cdef bytes py_bytes = value.encode('utf8')
|
||||
cdef char* chars = <char*>py_bytes
|
||||
self.write(sizeof(char), len(py_bytes), chars)
|
||||
|
||||
|
||||
cdef class StringCFile:
|
||||
def __init__(self, mode, bytes data=b'', on_open_error=None):
|
||||
self.mem = Pool()
|
||||
self.is_open = 'w' in mode
|
||||
self._capacity = max(len(data), 8)
|
||||
self.size = len(data)
|
||||
self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
|
||||
for i in range(len(data)):
|
||||
self.data[i] = data[i]
|
||||
|
||||
def close(self):
|
||||
self.is_open = False
|
||||
|
||||
def string_data(self):
|
||||
return (self.data-self.size)[:self.size]
|
||||
|
||||
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
|
||||
memcpy(dest, self.data, elem_size * number)
|
||||
self.data += elem_size * number
|
||||
|
||||
cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
|
||||
write_size = number * elem_size
|
||||
if (self.size + write_size) >= self._capacity:
|
||||
self._capacity = (self.size + write_size) * 2
|
||||
self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
|
||||
memcpy(&self.data[self.size], src, elem_size * number)
|
||||
self.size += write_size
|
||||
|
||||
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
|
||||
cdef void* dest = mem.alloc(number, elem_size)
|
||||
self.read_into(dest, number, elem_size)
|
||||
return dest
|
||||
|
||||
def write_unicode(self, unicode value):
|
||||
cdef bytes py_bytes = value.encode('utf8')
|
||||
cdef char* chars = <char*>py_bytes
|
||||
self.write(sizeof(char), len(py_bytes), chars)
|
523
spacy/_ml.py
523
spacy/_ml.py
|
@ -1,24 +1,106 @@
|
|||
import ujson
|
||||
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
||||
from thinc.neural import Model, Maxout, Softmax, Affine
|
||||
from thinc.neural._classes.hash_embed import HashEmbed
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
|
||||
from thinc.i2v import HashEmbed, StaticVectors
|
||||
from thinc.t2t import ExtractWindow, ParametricAttention
|
||||
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
|
||||
from thinc.misc import Residual
|
||||
from thinc.misc import BatchNorm as BN
|
||||
from thinc.misc import LayerNorm as LN
|
||||
|
||||
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
||||
from thinc.api import FeatureExtracter, with_getitem
|
||||
from thinc.api import uniqued, wrap, flatten_add_lengths, noop
|
||||
|
||||
from thinc.linear.linear import LinearModel
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.util import get_array_module
|
||||
|
||||
import random
|
||||
import cytoolz
|
||||
|
||||
from thinc.neural._classes.convolution import ExtractWindow
|
||||
from thinc.neural._classes.static_vectors import StaticVectors
|
||||
from thinc.neural._classes.batchnorm import BatchNorm
|
||||
from thinc.neural._classes.resnet import Residual
|
||||
from thinc.neural import ReLu
|
||||
from thinc import describe
|
||||
from thinc.describe import Dimension, Synapses, Biases, Gradient
|
||||
from thinc.neural._classes.affine import _set_dimensions_if_needed
|
||||
import thinc.extra.load_nlp
|
||||
|
||||
from .attrs import ID, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
|
||||
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
|
||||
from .tokens.doc import Doc
|
||||
from . import util
|
||||
|
||||
import numpy
|
||||
import io
|
||||
|
||||
# TODO: Unset this once we don't want to support models previous models.
|
||||
import thinc.neural._classes.layernorm
|
||||
thinc.neural._classes.layernorm.set_compat_six_eight(False)
|
||||
|
||||
VECTORS_KEY = 'spacy_pretrained_vectors'
|
||||
|
||||
@layerize
|
||||
def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
||||
ops = Model.ops
|
||||
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
|
||||
def finish_update(d_X, sgd=None):
|
||||
return ops.unflatten(d_X, lengths, pad=pad)
|
||||
X = ops.flatten(seqs, pad=pad)
|
||||
return (X, lengths), finish_update
|
||||
|
||||
|
||||
@layerize
|
||||
def _logistic(X, drop=0.):
|
||||
xp = get_array_module(X)
|
||||
if not isinstance(X, xp.ndarray):
|
||||
X = xp.asarray(X)
|
||||
# Clip to range (-10, 10)
|
||||
X = xp.minimum(X, 10., X)
|
||||
X = xp.maximum(X, -10., X)
|
||||
Y = 1. / (1. + xp.exp(-X))
|
||||
def logistic_bwd(dY, sgd=None):
|
||||
dX = dY * (Y * (1-Y))
|
||||
return dX
|
||||
return Y, logistic_bwd
|
||||
|
||||
|
||||
@layerize
|
||||
def add_tuples(X, drop=0.):
|
||||
"""Give inputs of sequence pairs, where each sequence is (vals, length),
|
||||
sum the values, returning a single sequence.
|
||||
|
||||
If input is:
|
||||
((vals1, length), (vals2, length)
|
||||
Output is:
|
||||
(vals1+vals2, length)
|
||||
|
||||
vals are a single tensor for the whole batch.
|
||||
"""
|
||||
(vals1, length1), (vals2, length2) = X
|
||||
assert length1 == length2
|
||||
|
||||
def add_tuples_bwd(dY, sgd=None):
|
||||
return (dY, dY)
|
||||
|
||||
return (vals1+vals2, length), add_tuples_bwd
|
||||
|
||||
|
||||
def _zero_init(model):
|
||||
def _zero_init_impl(self, X, y):
|
||||
self.W.fill(0)
|
||||
model.on_data_hooks.append(_zero_init_impl)
|
||||
if model.W is not None:
|
||||
model.W.fill(0.)
|
||||
return model
|
||||
|
||||
|
||||
@layerize
|
||||
def _preprocess_doc(docs, drop=0.):
|
||||
keys = [doc.to_array([LOWER]) for doc in docs]
|
||||
keys = [a[:, 0] for a in keys]
|
||||
ops = Model.ops
|
||||
lengths = ops.asarray([arr.shape[0] for arr in keys])
|
||||
keys = ops.xp.concatenate(keys)
|
||||
vals = ops.allocate(keys.shape[0]) + 1
|
||||
return (keys, vals, lengths), None
|
||||
|
||||
|
||||
def _init_for_precomputed(W, ops):
|
||||
if (W**2).sum() != 0.:
|
||||
|
@ -27,6 +109,7 @@ def _init_for_precomputed(W, ops):
|
|||
ops.xavier_uniform_init(reshaped)
|
||||
W[:] = reshaped.reshape(W.shape)
|
||||
|
||||
|
||||
@describe.on_data(_set_dimensions_if_needed)
|
||||
@describe.attributes(
|
||||
nI=Dimension("Input size"),
|
||||
|
@ -130,34 +213,163 @@ class PrecomputableMaxouts(Model):
|
|||
return dXf
|
||||
return Yfp, backward
|
||||
|
||||
def Tok2Vec(width, embed_size, preprocess=None):
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
|
||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
|
||||
norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
|
||||
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
|
||||
suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
|
||||
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')
|
||||
# Thinc's Embed class is a bit broken atm, so drop this here.
|
||||
from thinc import describe
|
||||
from thinc.neural._classes.embed import _uniform_init
|
||||
|
||||
embed = (norm | prefix | suffix | shape )
|
||||
tok2vec = (
|
||||
with_flatten(
|
||||
asarray(Model.ops, dtype='uint64')
|
||||
>> embed
|
||||
>> Maxout(width, width*4, pieces=3)
|
||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
|
||||
pad=4)
|
||||
|
||||
@describe.attributes(
|
||||
nV=describe.Dimension("Number of vectors"),
|
||||
nO=describe.Dimension("Size of output"),
|
||||
vectors=describe.Weights("Embedding table",
|
||||
lambda obj: (obj.nV, obj.nO),
|
||||
_uniform_init(-0.1, 0.1)
|
||||
),
|
||||
d_vectors=describe.Gradient("vectors")
|
||||
)
|
||||
class Embed(Model):
|
||||
name = 'embed'
|
||||
|
||||
def __init__(self, nO, nV=None, **kwargs):
|
||||
if nV is not None:
|
||||
nV += 1
|
||||
Model.__init__(self, **kwargs)
|
||||
if 'name' in kwargs:
|
||||
self.name = kwargs['name']
|
||||
self.column = kwargs.get('column', 0)
|
||||
self.nO = nO
|
||||
self.nV = nV
|
||||
|
||||
def predict(self, ids):
|
||||
if ids.ndim == 2:
|
||||
ids = ids[:, self.column]
|
||||
return self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f')
|
||||
|
||||
def begin_update(self, ids, drop=0.):
|
||||
if ids.ndim == 2:
|
||||
ids = ids[:, self.column]
|
||||
vectors = self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f')
|
||||
def backprop_embed(d_vectors, sgd=None):
|
||||
n_vectors = d_vectors.shape[0]
|
||||
self.ops.scatter_add(self.d_vectors, ids, d_vectors)
|
||||
if sgd is not None:
|
||||
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
||||
return None
|
||||
return vectors, backprop_embed
|
||||
|
||||
|
||||
def HistoryFeatures(nr_class, hist_size=8, nr_dim=8):
|
||||
'''Wrap a model, adding features representing action history.'''
|
||||
if hist_size == 0:
|
||||
return layerize(noop())
|
||||
embed_tables = [Embed(nr_dim, nr_class, column=i, name='embed%d')
|
||||
for i in range(hist_size)]
|
||||
embed = chain(concatenate(*embed_tables),
|
||||
LN(Maxout(hist_size*nr_dim, hist_size*nr_dim)))
|
||||
ops = embed.ops
|
||||
def add_history_fwd(vectors_hists, drop=0.):
|
||||
vectors, hist_ids = vectors_hists
|
||||
hist_feats, bp_hists = embed.begin_update(hist_ids, drop=drop)
|
||||
outputs = ops.xp.hstack((vectors, hist_feats))
|
||||
|
||||
def add_history_bwd(d_outputs, sgd=None):
|
||||
d_vectors = d_outputs[:, :vectors.shape[1]]
|
||||
d_hists = d_outputs[:, vectors.shape[1]:]
|
||||
bp_hists(d_hists, sgd=sgd)
|
||||
return embed.ops.xp.ascontiguousarray(d_vectors)
|
||||
return outputs, add_history_bwd
|
||||
return wrap(add_history_fwd, embed)
|
||||
|
||||
|
||||
def drop_layer(layer, factor=2.):
|
||||
def drop_layer_fwd(X, drop=0.):
|
||||
if drop <= 0.:
|
||||
return layer.begin_update(X, drop=drop)
|
||||
else:
|
||||
coinflip = layer.ops.xp.random.random()
|
||||
if (coinflip / factor) >= drop:
|
||||
return layer.begin_update(X, drop=drop)
|
||||
else:
|
||||
return X, lambda dX, sgd=None: dX
|
||||
|
||||
model = wrap(drop_layer_fwd, layer)
|
||||
model.predict = layer
|
||||
return model
|
||||
|
||||
def link_vectors_to_models(vocab):
|
||||
vectors = vocab.vectors
|
||||
ops = Model.ops
|
||||
for word in vocab:
|
||||
if word.orth in vectors.key2row:
|
||||
word.rank = vectors.key2row[word.orth]
|
||||
else:
|
||||
word.rank = 0
|
||||
data = ops.asarray(vectors.data)
|
||||
# Set an entry here, so that vectors are accessed by StaticVectors
|
||||
# (unideal, I know)
|
||||
thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
|
||||
|
||||
def Tok2Vec(width, embed_size, **kwargs):
|
||||
pretrained_dims = kwargs.get('pretrained_dims', 0)
|
||||
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
|
||||
'*': reapply}):
|
||||
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
|
||||
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
|
||||
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
|
||||
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
|
||||
if pretrained_dims is not None and pretrained_dims >= 1:
|
||||
glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
|
||||
|
||||
embed = uniqued(
|
||||
(glove | norm | prefix | suffix | shape)
|
||||
>> LN(Maxout(width, width*5, pieces=3)), column=5)
|
||||
else:
|
||||
embed = uniqued(
|
||||
(norm | prefix | suffix | shape)
|
||||
>> LN(Maxout(width, width*4, pieces=3)), column=5)
|
||||
|
||||
|
||||
convolution = Residual(
|
||||
ExtractWindow(nW=1)
|
||||
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
|
||||
)
|
||||
if preprocess not in (False, None):
|
||||
tok2vec = preprocess >> tok2vec
|
||||
|
||||
tok2vec = (
|
||||
FeatureExtracter(cols)
|
||||
>> with_flatten(
|
||||
embed >> (convolution ** 4), pad=4)
|
||||
)
|
||||
|
||||
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
|
||||
tok2vec.nO = width
|
||||
tok2vec.embed = embed
|
||||
return tok2vec
|
||||
|
||||
|
||||
def reapply(layer, n_times):
|
||||
def reapply_fwd(X, drop=0.):
|
||||
backprops = []
|
||||
for i in range(n_times):
|
||||
Y, backprop = layer.begin_update(X, drop=drop)
|
||||
X = Y
|
||||
backprops.append(backprop)
|
||||
def reapply_bwd(dY, sgd=None):
|
||||
dX = None
|
||||
for backprop in reversed(backprops):
|
||||
dY = backprop(dY, sgd=sgd)
|
||||
if dX is None:
|
||||
dX = dY
|
||||
else:
|
||||
dX += dY
|
||||
return dX
|
||||
return Y, reapply_bwd
|
||||
return wrap(reapply_fwd, layer)
|
||||
|
||||
|
||||
|
||||
|
||||
def asarray(ops, dtype):
|
||||
def forward(X, drop=0.):
|
||||
return ops.asarray(X, dtype=dtype), None
|
||||
|
@ -243,7 +455,8 @@ def zero_init(model):
|
|||
|
||||
|
||||
def doc2feats(cols=None):
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
|
||||
if cols is None:
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||
def forward(docs, drop=0.):
|
||||
feats = []
|
||||
for doc in docs:
|
||||
|
@ -269,6 +482,46 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
|
|||
return vectors, backward
|
||||
|
||||
|
||||
def fine_tune(embedding, combine=None):
|
||||
if combine is not None:
|
||||
raise NotImplementedError(
|
||||
"fine_tune currently only supports addition. Set combine=None")
|
||||
def fine_tune_fwd(docs_tokvecs, drop=0.):
|
||||
docs, tokvecs = docs_tokvecs
|
||||
|
||||
lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
|
||||
|
||||
vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
|
||||
flat_tokvecs = embedding.ops.flatten(tokvecs)
|
||||
flat_vecs = embedding.ops.flatten(vecs)
|
||||
output = embedding.ops.unflatten(
|
||||
(model.mix[0] * flat_tokvecs + model.mix[1] * flat_vecs), lengths)
|
||||
|
||||
def fine_tune_bwd(d_output, sgd=None):
|
||||
flat_grad = model.ops.flatten(d_output)
|
||||
model.d_mix[0] += flat_tokvecs.dot(flat_grad.T).sum()
|
||||
model.d_mix[1] += flat_vecs.dot(flat_grad.T).sum()
|
||||
|
||||
bp_vecs([d_o * model.mix[1] for d_o in d_output], sgd=sgd)
|
||||
if sgd is not None:
|
||||
sgd(model._mem.weights, model._mem.gradient, key=model.id)
|
||||
return [d_o * model.mix[0] for d_o in d_output]
|
||||
return output, fine_tune_bwd
|
||||
|
||||
def fine_tune_predict(docs_tokvecs):
|
||||
docs, tokvecs = docs_tokvecs
|
||||
vecs = embedding(docs)
|
||||
return [model.mix[0]*tv+model.mix[1]*v
|
||||
for tv, v in zip(tokvecs, vecs)]
|
||||
|
||||
model = wrap(fine_tune_fwd, embedding)
|
||||
model.mix = model._mem.add((model.id, 'mix'), (2,))
|
||||
model.mix.fill(0.5)
|
||||
model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
|
||||
model.predict = fine_tune_predict
|
||||
return model
|
||||
|
||||
|
||||
@layerize
|
||||
def flatten(seqs, drop=0.):
|
||||
if isinstance(seqs[0], numpy.ndarray):
|
||||
|
@ -282,3 +535,211 @@ def flatten(seqs, drop=0.):
|
|||
return ops.unflatten(d_X, lengths)
|
||||
X = ops.xp.vstack(seqs)
|
||||
return X, finish_update
|
||||
|
||||
|
||||
@layerize
|
||||
def logistic(X, drop=0.):
|
||||
xp = get_array_module(X)
|
||||
if not isinstance(X, xp.ndarray):
|
||||
X = xp.asarray(X)
|
||||
# Clip to range (-10, 10)
|
||||
X = xp.minimum(X, 10., X)
|
||||
X = xp.maximum(X, -10., X)
|
||||
Y = 1. / (1. + xp.exp(-X))
|
||||
def logistic_bwd(dY, sgd=None):
|
||||
dX = dY * (Y * (1-Y))
|
||||
return dX
|
||||
return Y, logistic_bwd
|
||||
|
||||
|
||||
def zero_init(model):
|
||||
def _zero_init_impl(self, X, y):
|
||||
self.W.fill(0)
|
||||
model.on_data_hooks.append(_zero_init_impl)
|
||||
return model
|
||||
|
||||
@layerize
|
||||
def preprocess_doc(docs, drop=0.):
|
||||
keys = [doc.to_array([LOWER]) for doc in docs]
|
||||
keys = [a[:, 0] for a in keys]
|
||||
ops = Model.ops
|
||||
lengths = ops.asarray([arr.shape[0] for arr in keys])
|
||||
keys = ops.xp.concatenate(keys)
|
||||
vals = ops.allocate(keys.shape[0]) + 1
|
||||
return (keys, vals, lengths), None
|
||||
|
||||
def getitem(i):
|
||||
def getitem_fwd(X, drop=0.):
|
||||
return X[i], None
|
||||
return layerize(getitem_fwd)
|
||||
|
||||
def build_tagger_model(nr_class, **cfg):
|
||||
embed_size = util.env_opt('embed_size', 7000)
|
||||
if 'token_vector_width' in cfg:
|
||||
token_vector_width = cfg['token_vector_width']
|
||||
else:
|
||||
token_vector_width = util.env_opt('token_vector_width', 128)
|
||||
pretrained_dims = cfg.get('pretrained_dims', 0)
|
||||
with Model.define_operators({'>>': chain, '+': add}):
|
||||
if 'tok2vec' in cfg:
|
||||
tok2vec = cfg['tok2vec']
|
||||
else:
|
||||
tok2vec = Tok2Vec(token_vector_width, embed_size,
|
||||
pretrained_dims=pretrained_dims)
|
||||
model = (
|
||||
tok2vec
|
||||
>> with_flatten(Softmax(nr_class, token_vector_width))
|
||||
)
|
||||
model.nI = None
|
||||
model.tok2vec = tok2vec
|
||||
return model
|
||||
|
||||
|
||||
@layerize
|
||||
def SpacyVectors(docs, drop=0.):
|
||||
xp = get_array_module(docs[0].vocab.vectors.data)
|
||||
width = docs[0].vocab.vectors.data.shape[1]
|
||||
batch = []
|
||||
for doc in docs:
|
||||
indices = numpy.zeros((len(doc),), dtype='i')
|
||||
for i, word in enumerate(doc):
|
||||
if word.orth in doc.vocab.vectors.key2row:
|
||||
indices[i] = doc.vocab.vectors.key2row[word.orth]
|
||||
else:
|
||||
indices[i] = 0
|
||||
vectors = doc.vocab.vectors.data[indices]
|
||||
batch.append(vectors)
|
||||
return batch, None
|
||||
|
||||
|
||||
def foreach(layer, drop_factor=1.0):
|
||||
'''Map a layer across elements in a list'''
|
||||
def foreach_fwd(Xs, drop=0.):
|
||||
drop *= drop_factor
|
||||
ys = []
|
||||
backprops = []
|
||||
for X in Xs:
|
||||
y, bp_y = layer.begin_update(X, drop=drop)
|
||||
ys.append(y)
|
||||
backprops.append(bp_y)
|
||||
def foreach_bwd(d_ys, sgd=None):
|
||||
d_Xs = []
|
||||
for d_y, bp_y in zip(d_ys, backprops):
|
||||
if bp_y is not None and bp_y is not None:
|
||||
d_Xs.append(d_y, sgd=sgd)
|
||||
else:
|
||||
d_Xs.append(None)
|
||||
return d_Xs
|
||||
return ys, foreach_bwd
|
||||
model = wrap(foreach_fwd, layer)
|
||||
return model
|
||||
|
||||
|
||||
def build_text_classifier(nr_class, width=64, **cfg):
|
||||
nr_vector = cfg.get('nr_vector', 5000)
|
||||
pretrained_dims = cfg.get('pretrained_dims', 0)
|
||||
with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
|
||||
'**': clone}):
|
||||
if cfg.get('low_data'):
|
||||
model = (
|
||||
SpacyVectors
|
||||
>> flatten_add_lengths
|
||||
>> with_getitem(0,
|
||||
Affine(width, pretrained_dims)
|
||||
)
|
||||
>> ParametricAttention(width)
|
||||
>> Pooling(sum_pool)
|
||||
>> Residual(ReLu(width, width)) ** 2
|
||||
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
|
||||
>> logistic
|
||||
)
|
||||
return model
|
||||
|
||||
|
||||
lower = HashEmbed(width, nr_vector, column=1)
|
||||
prefix = HashEmbed(width//2, nr_vector, column=2)
|
||||
suffix = HashEmbed(width//2, nr_vector, column=3)
|
||||
shape = HashEmbed(width//2, nr_vector, column=4)
|
||||
|
||||
trained_vectors = (
|
||||
FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
|
||||
>> with_flatten(
|
||||
uniqued(
|
||||
(lower | prefix | suffix | shape)
|
||||
>> LN(Maxout(width, width+(width//2)*3)),
|
||||
column=0
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
if pretrained_dims:
|
||||
static_vectors = (
|
||||
SpacyVectors
|
||||
>> with_flatten(Affine(width, pretrained_dims))
|
||||
)
|
||||
# TODO Make concatenate support lists
|
||||
vectors = concatenate_lists(trained_vectors, static_vectors)
|
||||
vectors_width = width*2
|
||||
else:
|
||||
vectors = trained_vectors
|
||||
vectors_width = width
|
||||
static_vectors = None
|
||||
cnn_model = (
|
||||
vectors
|
||||
>> with_flatten(
|
||||
LN(Maxout(width, vectors_width))
|
||||
>> Residual(
|
||||
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
|
||||
) ** 2, pad=2
|
||||
)
|
||||
>> flatten_add_lengths
|
||||
>> ParametricAttention(width)
|
||||
>> Pooling(sum_pool)
|
||||
>> Residual(zero_init(Maxout(width, width)))
|
||||
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
|
||||
)
|
||||
|
||||
linear_model = (
|
||||
_preprocess_doc
|
||||
>> LinearModel(nr_class, drop_factor=0.)
|
||||
)
|
||||
|
||||
model = (
|
||||
(linear_model | cnn_model)
|
||||
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
|
||||
>> logistic
|
||||
)
|
||||
model.nO = nr_class
|
||||
model.lsuv = False
|
||||
return model
|
||||
|
||||
@layerize
|
||||
def flatten(seqs, drop=0.):
|
||||
ops = Model.ops
|
||||
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
|
||||
def finish_update(d_X, sgd=None):
|
||||
return ops.unflatten(d_X, lengths, pad=0)
|
||||
X = ops.flatten(seqs, pad=0)
|
||||
return X, finish_update
|
||||
|
||||
|
||||
def concatenate_lists(*layers, **kwargs): # pragma: no cover
|
||||
'''Compose two or more models `f`, `g`, etc, such that their outputs are
|
||||
concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
|
||||
'''
|
||||
if not layers:
|
||||
return noop()
|
||||
drop_factor = kwargs.get('drop_factor', 1.0)
|
||||
ops = layers[0].ops
|
||||
layers = [chain(layer, flatten) for layer in layers]
|
||||
concat = concatenate(*layers)
|
||||
def concatenate_lists_fwd(Xs, drop=0.):
|
||||
drop *= drop_factor
|
||||
lengths = ops.asarray([len(X) for X in Xs], dtype='i')
|
||||
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
|
||||
ys = ops.unflatten(flat_y, lengths)
|
||||
def concatenate_lists_bwd(d_ys, sgd=None):
|
||||
return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
|
||||
return ys, concatenate_lists_bwd
|
||||
model = wrap(concatenate_lists_fwd, concat)
|
||||
return model
|
||||
|
|
|
@ -3,14 +3,15 @@
|
|||
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
||||
|
||||
__title__ = 'spacy-nightly'
|
||||
__version__ = '2.0.0a1'
|
||||
__version__ = '2.0.0a17'
|
||||
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
|
||||
__uri__ = 'https://spacy.io'
|
||||
__author__ = 'Explosion AI'
|
||||
__email__ = 'contact@explosion.ai'
|
||||
__license__ = 'MIT'
|
||||
__release__ = False
|
||||
|
||||
__docs_models__ = 'https://spacy.io/docs/usage/models'
|
||||
__docs_models__ = 'https://alpha.spacy.io/usage/models'
|
||||
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
|
||||
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
|
||||
__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Reserve 64 values for flag features
|
||||
cpdef enum attr_id_t:
|
||||
cdef enum attr_id_t:
|
||||
NULL_ATTR
|
||||
IS_ALPHA
|
||||
IS_ASCII
|
||||
|
|
|
@ -94,6 +94,7 @@ IDS = {
|
|||
|
||||
# ATTR IDs, in order of the symbol
|
||||
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
|
||||
locals().update(IDS)
|
||||
|
||||
|
||||
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||
|
|
|
@ -2,5 +2,9 @@ from .download import download
|
|||
from .info import info
|
||||
from .link import link
|
||||
from .package import package
|
||||
from .profile import profile
|
||||
from .train import train
|
||||
from .evaluate import evaluate
|
||||
from .convert import convert
|
||||
from .model import model
|
||||
from .validate import validate
|
||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
|||
import plac
|
||||
from pathlib import Path
|
||||
|
||||
from .converters import conllu2json, iob2json
|
||||
from .converters import conllu2json, iob2json, conll_ner2json
|
||||
from ..util import prints
|
||||
|
||||
# Converters are matched by file extension. To add a converter, add a new entry
|
||||
|
@ -12,19 +12,22 @@ from ..util import prints
|
|||
# from /converters.
|
||||
|
||||
CONVERTERS = {
|
||||
'.conllu': conllu2json,
|
||||
'.conll': conllu2json,
|
||||
'.iob': iob2json
|
||||
'conllu': conllu2json,
|
||||
'conll': conllu2json,
|
||||
'ner': conll_ner2json,
|
||||
'iob': iob2json,
|
||||
}
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
input_file=("input file", "positional", None, str),
|
||||
output_dir=("output directory for converted file", "positional", None, str),
|
||||
n_sents=("Number of sentences per doc", "option", "n", float),
|
||||
n_sents=("Number of sentences per doc", "option", "n", int),
|
||||
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool)
|
||||
)
|
||||
def convert(cmd, input_file, output_dir, n_sents, morphology):
|
||||
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
|
||||
converter='auto'):
|
||||
"""
|
||||
Convert files into JSON format for use with train command and other
|
||||
experiment management functions.
|
||||
|
@ -35,9 +38,11 @@ def convert(cmd, input_file, output_dir, n_sents, morphology):
|
|||
prints(input_path, title="Input file not found", exits=1)
|
||||
if not output_path.exists():
|
||||
prints(output_path, title="Output directory not found", exits=1)
|
||||
file_ext = input_path.suffix
|
||||
if not file_ext in CONVERTERS:
|
||||
prints("Can't find converter for %s" % input_path.parts[-1],
|
||||
title="Unknown format", exits=1)
|
||||
CONVERTERS[file_ext](input_path, output_path,
|
||||
n_sents=n_sents, use_morphology=morphology)
|
||||
if converter == 'auto':
|
||||
converter = input_path.suffix[1:]
|
||||
if not converter in CONVERTERS:
|
||||
prints("Can't find converter for %s" % converter,
|
||||
title="Unknown format", exits=1)
|
||||
func = CONVERTERS[converter]
|
||||
func(input_path, output_path,
|
||||
n_sents=n_sents, use_morphology=morphology)
|
||||
|
|
|
@ -1,2 +1,3 @@
|
|||
from .conllu2json import conllu2json
|
||||
from .iob2json import iob2json
|
||||
from .conll_ner2json import conll_ner2json
|
||||
|
|
50
spacy/cli/converters/conll_ner2json.py
Normal file
50
spacy/cli/converters/conll_ner2json.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
from ...gold import iob_to_biluo
|
||||
|
||||
|
||||
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
|
||||
"""
|
||||
Convert files in the CoNLL-2003 NER format into JSON format for use with train cli.
|
||||
"""
|
||||
docs = read_conll_ner(input_path)
|
||||
|
||||
output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
|
||||
output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
|
||||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
f.write(json_dumps(docs))
|
||||
prints("Created %d documents" % len(docs),
|
||||
title="Generated output file %s" % path2str(output_file))
|
||||
|
||||
|
||||
def read_conll_ner(input_path):
|
||||
text = input_path.open('r', encoding='utf-8').read()
|
||||
i = 0
|
||||
delimit_docs = '-DOCSTART- -X- O O'
|
||||
output_docs = []
|
||||
for doc in text.strip().split(delimit_docs):
|
||||
doc = doc.strip()
|
||||
if not doc:
|
||||
continue
|
||||
output_doc = []
|
||||
for sent in doc.split('\n\n'):
|
||||
sent = sent.strip()
|
||||
if not sent:
|
||||
continue
|
||||
lines = [line.strip() for line in sent.split('\n') if line.strip()]
|
||||
words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
|
||||
biluo_ents = iob_to_biluo(iob_ents)
|
||||
output_doc.append({'tokens': [
|
||||
{'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
|
||||
zip(words, tags, biluo_ents)
|
||||
]})
|
||||
output_docs.append({
|
||||
'id': len(output_docs),
|
||||
'paragraphs': [{'sentences': output_doc}]
|
||||
})
|
||||
output_doc = []
|
||||
return output_docs
|
|
@ -1,5 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
from cytoolz import partition_all, concat
|
||||
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
|
@ -10,11 +11,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
|
|||
"""
|
||||
Convert IOB files into JSON format for use with train cli.
|
||||
"""
|
||||
# TODO: This isn't complete yet -- need to map from IOB to
|
||||
# BILUO
|
||||
with input_path.open('r', encoding='utf8') as file_:
|
||||
docs = read_iob(file_)
|
||||
|
||||
sentences = read_iob(file_)
|
||||
docs = merge_sentences(sentences, n_sents)
|
||||
output_filename = input_path.parts[-1].replace(".iob", ".json")
|
||||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
|
@ -23,9 +22,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
|
|||
title="Generated output file %s" % path2str(output_file))
|
||||
|
||||
|
||||
def read_iob(file_):
|
||||
def read_iob(raw_sents):
|
||||
sentences = []
|
||||
for line in file_:
|
||||
for line in raw_sents:
|
||||
if not line.strip():
|
||||
continue
|
||||
tokens = [t.split('|') for t in line.split()]
|
||||
|
@ -43,3 +42,15 @@ def read_iob(file_):
|
|||
paragraphs = [{'sentences': [sent]} for sent in sentences]
|
||||
docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
|
||||
return docs
|
||||
|
||||
def merge_sentences(docs, n_sents):
|
||||
counter = 0
|
||||
merged = []
|
||||
for group in partition_all(n_sents, docs):
|
||||
group = list(group)
|
||||
first = group.pop(0)
|
||||
to_extend = first['paragraphs'][0]['sentences']
|
||||
for sent in group[1:]:
|
||||
to_extend.extend(sent['paragraphs'][0]['sentences'])
|
||||
merged.append(first)
|
||||
return merged
|
||||
|
|
|
@ -8,7 +8,7 @@ import subprocess
|
|||
import sys
|
||||
|
||||
from .link import link
|
||||
from ..util import prints
|
||||
from ..util import prints, get_package_path
|
||||
from .. import about
|
||||
|
||||
|
||||
|
@ -24,24 +24,29 @@ def download(cmd, model, direct=False):
|
|||
with version.
|
||||
"""
|
||||
if direct:
|
||||
download_model('{m}/{m}.tar.gz'.format(m=model))
|
||||
dl = download_model('{m}/{m}.tar.gz'.format(m=model))
|
||||
else:
|
||||
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
||||
model_name = shortcuts.get(model, model)
|
||||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
||||
try:
|
||||
link(None, model_name, model, force=True)
|
||||
except:
|
||||
# Dirty, but since spacy.download and the auto-linking is mostly
|
||||
# a convenience wrapper, it's best to show a success message and
|
||||
# loading instructions, even if linking fails.
|
||||
prints("Creating a shortcut link for 'en' didn't work (maybe you "
|
||||
"don't have admin permissions?), but you can still load "
|
||||
"the model via its full package name:",
|
||||
"nlp = spacy.load('%s')" % model_name,
|
||||
title="Download successful")
|
||||
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
||||
if dl == 0:
|
||||
try:
|
||||
# Get package path here because link uses
|
||||
# pip.get_installed_distributions() to check if model is a package,
|
||||
# which fails if model was just installed via subprocess
|
||||
package_path = get_package_path(model_name)
|
||||
link(None, model_name, model, force=True, model_path=package_path)
|
||||
except:
|
||||
# Dirty, but since spacy.download and the auto-linking is mostly
|
||||
# a convenience wrapper, it's best to show a success message and
|
||||
# loading instructions, even if linking fails.
|
||||
prints("Creating a shortcut link for 'en' didn't work (maybe you "
|
||||
"don't have admin permissions?), but you can still load "
|
||||
"the model via its full package name:",
|
||||
"nlp = spacy.load('%s')" % model_name,
|
||||
title="Download successful")
|
||||
|
||||
|
||||
def get_json(url, desc):
|
||||
|
@ -73,6 +78,6 @@ def get_version(model, comp):
|
|||
|
||||
def download_model(filename):
|
||||
download_url = about.__download_url__ + '/' + filename
|
||||
subprocess.call([sys.executable, '-m',
|
||||
return subprocess.call([sys.executable, '-m',
|
||||
'pip', 'install', '--no-cache-dir', download_url],
|
||||
env=os.environ.copy())
|
||||
|
|
120
spacy/cli/evaluate.py
Normal file
120
spacy/cli/evaluate.py
Normal file
|
@ -0,0 +1,120 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
import json
|
||||
from collections import defaultdict
|
||||
import cytoolz
|
||||
from pathlib import Path
|
||||
import dill
|
||||
import tqdm
|
||||
from thinc.neural._classes.model import Model
|
||||
from thinc.neural.optimizers import linear_decay
|
||||
from timeit import default_timer as timer
|
||||
import random
|
||||
import numpy.random
|
||||
|
||||
from ..tokens.doc import Doc
|
||||
from ..scorer import Scorer
|
||||
from ..gold import GoldParse, merge_sents
|
||||
from ..gold import GoldCorpus, minibatch
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
from .. import about
|
||||
from .. import displacy
|
||||
from ..compat import json_dumps
|
||||
|
||||
random.seed(0)
|
||||
numpy.random.seed(0)
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Model name or path", "positional", None, str),
|
||||
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
gpu_id=("Use GPU", "option", "g", int),
|
||||
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
|
||||
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)
|
||||
)
|
||||
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
||||
displacy_path=None, displacy_limit=25):
|
||||
"""
|
||||
Evaluate a model. To render a sample of parses in a HTML file, set an output
|
||||
directory as the displacy_path argument.
|
||||
"""
|
||||
if gpu_id >= 0:
|
||||
util.use_gpu(gpu_id)
|
||||
util.set_env_log(False)
|
||||
data_path = util.ensure_path(data_path)
|
||||
displacy_path = util.ensure_path(displacy_path)
|
||||
if not data_path.exists():
|
||||
prints(data_path, title="Evaluation data not found", exits=1)
|
||||
if displacy_path and not displacy_path.exists():
|
||||
prints(displacy_path, title="Visualization output directory not found", exits=1)
|
||||
corpus = GoldCorpus(data_path, data_path)
|
||||
nlp = util.load_model(model)
|
||||
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
||||
begin = timer()
|
||||
scorer = nlp.evaluate(dev_docs, verbose=False)
|
||||
end = timer()
|
||||
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
||||
print_results(scorer, time=end - begin, words=nwords,
|
||||
wps=nwords / (end - begin))
|
||||
if displacy_path:
|
||||
docs, golds = zip(*dev_docs)
|
||||
render_deps = 'parser' in nlp.meta.get('pipeline', [])
|
||||
render_ents = 'ner' in nlp.meta.get('pipeline', [])
|
||||
render_parses(docs, displacy_path, model_name=model, limit=displacy_limit,
|
||||
deps=render_deps, ents=render_ents)
|
||||
prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit)
|
||||
|
||||
|
||||
def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True):
|
||||
docs[0].user_data['title'] = model_name
|
||||
if ents:
|
||||
with (output_path / 'entities.html').open('w') as file_:
|
||||
html = displacy.render(docs[:limit], style='ent', page=True)
|
||||
file_.write(html)
|
||||
if deps:
|
||||
with (output_path / 'parses.html').open('w') as file_:
|
||||
html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True})
|
||||
file_.write(html)
|
||||
|
||||
|
||||
def print_progress(itn, losses, dev_scores, wps=0.0):
|
||||
scores = {}
|
||||
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
|
||||
'ents_p', 'ents_r', 'ents_f', 'wps']:
|
||||
scores[col] = 0.0
|
||||
scores['dep_loss'] = losses.get('parser', 0.0)
|
||||
scores['ner_loss'] = losses.get('ner', 0.0)
|
||||
scores['tag_loss'] = losses.get('tagger', 0.0)
|
||||
scores.update(dev_scores)
|
||||
scores['wps'] = wps
|
||||
tpl = '\t'.join((
|
||||
'{:d}',
|
||||
'{dep_loss:.3f}',
|
||||
'{ner_loss:.3f}',
|
||||
'{uas:.3f}',
|
||||
'{ents_p:.3f}',
|
||||
'{ents_r:.3f}',
|
||||
'{ents_f:.3f}',
|
||||
'{tags_acc:.3f}',
|
||||
'{token_acc:.3f}',
|
||||
'{wps:.1f}'))
|
||||
print(tpl.format(itn, **scores))
|
||||
|
||||
|
||||
def print_results(scorer, time, words, wps):
|
||||
results = {
|
||||
'Time': '%.2f s' % time,
|
||||
'Words': words,
|
||||
'Words/s': '%.0f' % wps,
|
||||
'TOK': '%.2f' % scorer.token_acc,
|
||||
'POS': '%.2f' % scorer.tags_acc,
|
||||
'UAS': '%.2f' % scorer.uas,
|
||||
'LAS': '%.2f' % scorer.las,
|
||||
'NER P': '%.2f' % scorer.ents_p,
|
||||
'NER R': '%.2f' % scorer.ents_r,
|
||||
'NER F': '%.2f' % scorer.ents_f}
|
||||
util.print_table(results, title="Results")
|
|
@ -14,7 +14,7 @@ from .. import util
|
|||
link_name=("name of shortuct link to create", "positional", None, str),
|
||||
force=("force overwriting of existing link", "flag", "f", bool)
|
||||
)
|
||||
def link(cmd, origin, link_name, force=False):
|
||||
def link(cmd, origin, link_name, force=False, model_path=None):
|
||||
"""
|
||||
Create a symlink for models within the spacy/data directory. Accepts
|
||||
either the name of a pip package, or the local path to the model data
|
||||
|
@ -23,10 +23,17 @@ def link(cmd, origin, link_name, force=False):
|
|||
if util.is_package(origin):
|
||||
model_path = util.get_package_path(origin)
|
||||
else:
|
||||
model_path = Path(origin)
|
||||
model_path = Path(origin) if model_path is None else Path(model_path)
|
||||
if not model_path.exists():
|
||||
prints("The data should be located in %s" % path2str(model_path),
|
||||
title="Can't locate model data", exits=1)
|
||||
data_path = util.get_data_path()
|
||||
if not data_path or not data_path.exists():
|
||||
spacy_loc = Path(__file__).parent.parent
|
||||
prints("Make sure a directory `/data` exists within your spaCy "
|
||||
"installation and try again. The data directory should be "
|
||||
"located here:", path2str(spacy_loc), exits=1,
|
||||
title="Can't find the spaCy data path to create model symlink")
|
||||
link_path = util.get_data_path() / link_name
|
||||
if link_path.exists() and not force:
|
||||
prints("To overwrite an existing link, use the --force flag.",
|
||||
|
|
137
spacy/cli/model.py
Normal file
137
spacy/cli/model.py
Normal file
|
@ -0,0 +1,137 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import bz2
|
||||
import gzip
|
||||
import math
|
||||
from ast import literal_eval
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import spacy
|
||||
from preshed.counter import PreshCounter
|
||||
|
||||
from .. import util
|
||||
from ..compat import fix_text
|
||||
|
||||
|
||||
def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data,
|
||||
min_doc_freq=5, min_word_freq=200):
|
||||
model_path = Path(model_dir)
|
||||
freqs_path = Path(freqs_data)
|
||||
clusters_path = Path(clusters_data) if clusters_data else None
|
||||
vectors_path = Path(vectors_data) if vectors_data else None
|
||||
|
||||
check_dirs(freqs_path, clusters_path, vectors_path)
|
||||
vocab = util.get_lang_class(lang).Defaults.create_vocab()
|
||||
nlp = spacy.blank(lang)
|
||||
vocab = nlp.vocab
|
||||
probs, oov_prob = read_probs(
|
||||
freqs_path, min_doc_freq=int(min_doc_freq), min_freq=int(min_doc_freq))
|
||||
clusters = read_clusters(clusters_path) if clusters_path else {}
|
||||
populate_vocab(vocab, clusters, probs, oov_prob)
|
||||
add_vectors(vocab, vectors_path)
|
||||
create_model(model_path, nlp)
|
||||
|
||||
|
||||
def add_vectors(vocab, vectors_path):
|
||||
with bz2.BZ2File(vectors_path.as_posix()) as f:
|
||||
num_words, dim = next(f).split()
|
||||
vocab.clear_vectors(int(dim))
|
||||
for line in f:
|
||||
word_w_vector = line.decode("utf8").strip().split(" ")
|
||||
word = word_w_vector[0]
|
||||
vector = np.array([float(val) for val in word_w_vector[1:]])
|
||||
if word in vocab:
|
||||
vocab.set_vector(word, vector)
|
||||
|
||||
|
||||
def create_model(model_path, model):
|
||||
if not model_path.exists():
|
||||
model_path.mkdir()
|
||||
model.to_disk(model_path.as_posix())
|
||||
|
||||
|
||||
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
|
||||
counts = PreshCounter()
|
||||
total = 0
|
||||
freqs_file = check_unzip(freqs_path)
|
||||
for i, line in enumerate(freqs_file):
|
||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
||||
freq = int(freq)
|
||||
counts.inc(i + 1, freq)
|
||||
total += freq
|
||||
counts.smooth()
|
||||
log_total = math.log(total)
|
||||
freqs_file = check_unzip(freqs_path)
|
||||
probs = {}
|
||||
for line in freqs_file:
|
||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
||||
doc_freq = int(doc_freq)
|
||||
freq = int(freq)
|
||||
if doc_freq >= min_doc_freq and freq >= min_freq and len(
|
||||
key) < max_length:
|
||||
word = literal_eval(key)
|
||||
smooth_count = counts.smoother(int(freq))
|
||||
probs[word] = math.log(smooth_count) - log_total
|
||||
oov_prob = math.log(counts.smoother(0)) - log_total
|
||||
return probs, oov_prob
|
||||
|
||||
|
||||
def read_clusters(clusters_path):
|
||||
clusters = {}
|
||||
with clusters_path.open() as f:
|
||||
for line in f:
|
||||
try:
|
||||
cluster, word, freq = line.split()
|
||||
word = fix_text(word)
|
||||
except ValueError:
|
||||
continue
|
||||
# If the clusterer has only seen the word a few times, its
|
||||
# cluster is unreliable.
|
||||
if int(freq) >= 3:
|
||||
clusters[word] = cluster
|
||||
else:
|
||||
clusters[word] = '0'
|
||||
# Expand clusters with re-casing
|
||||
for word, cluster in list(clusters.items()):
|
||||
if word.lower() not in clusters:
|
||||
clusters[word.lower()] = cluster
|
||||
if word.title() not in clusters:
|
||||
clusters[word.title()] = cluster
|
||||
if word.upper() not in clusters:
|
||||
clusters[word.upper()] = cluster
|
||||
return clusters
|
||||
|
||||
|
||||
def populate_vocab(vocab, clusters, probs, oov_prob):
|
||||
for word, prob in reversed(
|
||||
sorted(list(probs.items()), key=lambda item: item[1])):
|
||||
lexeme = vocab[word]
|
||||
lexeme.prob = prob
|
||||
lexeme.is_oov = False
|
||||
# Decode as a little-endian string, so that we can do & 15 to get
|
||||
# the first 4 bits. See _parse_features.pyx
|
||||
if word in clusters:
|
||||
lexeme.cluster = int(clusters[word][::-1], 2)
|
||||
else:
|
||||
lexeme.cluster = 0
|
||||
|
||||
|
||||
def check_unzip(file_path):
|
||||
file_path_str = file_path.as_posix()
|
||||
if file_path_str.endswith('gz'):
|
||||
return gzip.open(file_path_str)
|
||||
else:
|
||||
return file_path.open()
|
||||
|
||||
|
||||
def check_dirs(freqs_data, clusters_data, vectors_data):
|
||||
if not freqs_data.is_file():
|
||||
util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
|
||||
if clusters_data and not clusters_data.is_file():
|
||||
util.sys_exit(
|
||||
clusters_data.as_posix(), title="No Brown clusters file found")
|
||||
if vectors_data and not vectors_data.is_file():
|
||||
util.sys_exit(
|
||||
vectors_data.as_posix(), title="No word vectors file found")
|
|
@ -15,10 +15,11 @@ from .. import about
|
|||
@plac.annotations(
|
||||
input_dir=("directory with model data", "positional", None, str),
|
||||
output_dir=("output parent directory", "positional", None, str),
|
||||
meta=("path to meta.json", "option", "m", str),
|
||||
meta_path=("path to meta.json", "option", "m", str),
|
||||
create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
|
||||
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
|
||||
)
|
||||
def package(cmd, input_dir, output_dir, meta=None, force=False):
|
||||
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
|
||||
"""
|
||||
Generate Python package for model data, including meta and required
|
||||
installation files. A new directory will be created in the specified
|
||||
|
@ -26,7 +27,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
|
|||
"""
|
||||
input_path = util.ensure_path(input_dir)
|
||||
output_path = util.ensure_path(output_dir)
|
||||
meta_path = util.ensure_path(meta)
|
||||
meta_path = util.ensure_path(meta_path)
|
||||
if not input_path or not input_path.exists():
|
||||
prints(input_path, title="Model directory not found", exits=1)
|
||||
if not output_path or not output_path.exists():
|
||||
|
@ -38,7 +39,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
|
|||
template_manifest = get_template('MANIFEST.in')
|
||||
template_init = get_template('xx_model_name/__init__.py')
|
||||
meta_path = meta_path or input_path / 'meta.json'
|
||||
if meta_path.is_file():
|
||||
if not create_meta and meta_path.is_file():
|
||||
prints(meta_path, title="Reading meta.json from file")
|
||||
meta = util.read_json(meta_path)
|
||||
else:
|
||||
|
@ -100,12 +101,15 @@ def generate_meta():
|
|||
def generate_pipeline():
|
||||
prints("If set to 'True', the default pipeline is used. If set to 'False', "
|
||||
"the pipeline will be disabled. Components should be specified as a "
|
||||
"comma-separated list of component names, e.g. vectorizer, tagger, "
|
||||
"comma-separated list of component names, e.g. tensorizer, tagger, "
|
||||
"parser, ner. For more information, see the docs on processing pipelines.",
|
||||
title="Enter your model's pipeline components")
|
||||
pipeline = util.get_raw_input("Pipeline components", True)
|
||||
replace = {'True': True, 'False': False}
|
||||
return replace[pipeline] if pipeline in replace else pipeline.split(', ')
|
||||
subs = {'True': True, 'False': False}
|
||||
if pipeline in subs:
|
||||
return subs[pipeline]
|
||||
else:
|
||||
return [p.strip() for p in pipeline.split(',')]
|
||||
|
||||
|
||||
def validate_meta(meta, keys):
|
||||
|
|
45
spacy/cli/profile.py
Normal file
45
spacy/cli/profile.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
import ujson
|
||||
import cProfile
|
||||
import pstats
|
||||
|
||||
import spacy
|
||||
import sys
|
||||
import tqdm
|
||||
import cytoolz
|
||||
|
||||
|
||||
def read_inputs(loc):
|
||||
if loc is None:
|
||||
file_ = sys.stdin
|
||||
file_ = (line.encode('utf8') for line in file_)
|
||||
else:
|
||||
file_ = Path(loc).open()
|
||||
for line in file_:
|
||||
data = ujson.loads(line)
|
||||
text = data['text']
|
||||
yield text
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
lang=("model/language", "positional", None, str),
|
||||
inputs=("Location of input file", "positional", None, read_inputs)
|
||||
)
|
||||
def profile(cmd, lang, inputs=None):
|
||||
"""
|
||||
Profile a spaCy pipeline, to find out which functions take the most time.
|
||||
"""
|
||||
nlp = spacy.load(lang)
|
||||
texts = list(cytoolz.take(10000, inputs))
|
||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
||||
s = pstats.Stats("Profile.prof")
|
||||
s.strip_dirs().sort_stats("time").print_stats()
|
||||
|
||||
|
||||
def parse_texts(nlp, texts):
|
||||
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):
|
||||
pass
|
|
@ -8,8 +8,11 @@ import cytoolz
|
|||
from pathlib import Path
|
||||
import dill
|
||||
import tqdm
|
||||
from thinc.neural._classes.model import Model
|
||||
from thinc.neural.optimizers import linear_decay
|
||||
from timeit import default_timer as timer
|
||||
import random
|
||||
import numpy.random
|
||||
|
||||
from ..tokens.doc import Doc
|
||||
from ..scorer import Scorer
|
||||
|
@ -17,9 +20,13 @@ from ..gold import GoldParse, merge_sents
|
|||
from ..gold import GoldCorpus, minibatch
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
from .. import about
|
||||
from .. import displacy
|
||||
from ..compat import json_dumps
|
||||
|
||||
random.seed(0)
|
||||
numpy.random.seed(0)
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
lang=("model language", "positional", None, str),
|
||||
|
@ -29,13 +36,17 @@ from ..compat import json_dumps
|
|||
n_iter=("number of iterations", "option", "n", int),
|
||||
n_sents=("number of sentences", "option", "ns", int),
|
||||
use_gpu=("Use GPU", "option", "g", int),
|
||||
resume=("Whether to resume training", "flag", "R", bool),
|
||||
vectors=("Model to load vectors from", "option", "v"),
|
||||
no_tagger=("Don't train tagger", "flag", "T", bool),
|
||||
no_parser=("Don't train parser", "flag", "P", bool),
|
||||
no_entities=("Don't train NER", "flag", "N", bool)
|
||||
no_entities=("Don't train NER", "flag", "N", bool),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
version=("Model version", "option", "V", str),
|
||||
meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
|
||||
)
|
||||
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
||||
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False):
|
||||
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
|
||||
gold_preproc=False, version="0.0.0", meta_path=None):
|
||||
"""
|
||||
Train a model. Expects data in spaCy's JSON format.
|
||||
"""
|
||||
|
@ -44,19 +55,26 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
|||
output_path = util.ensure_path(output_dir)
|
||||
train_path = util.ensure_path(train_data)
|
||||
dev_path = util.ensure_path(dev_data)
|
||||
meta_path = util.ensure_path(meta_path)
|
||||
if not output_path.exists():
|
||||
output_path.mkdir()
|
||||
if not train_path.exists():
|
||||
prints(train_path, title="Training data not found", exits=1)
|
||||
if dev_path and not dev_path.exists():
|
||||
prints(dev_path, title="Development data not found", exits=1)
|
||||
if meta_path is not None and not meta_path.exists():
|
||||
prints(meta_path, title="meta.json not found", exits=1)
|
||||
meta = util.read_json(meta_path) if meta_path else {}
|
||||
if not isinstance(meta, dict):
|
||||
prints("Expected dict but got: {}".format(type(meta)),
|
||||
title="Not a valid meta.json format", exits=1)
|
||||
meta.setdefault('lang', lang)
|
||||
meta.setdefault('name', 'unnamed')
|
||||
|
||||
lang_class = util.get_lang_class(lang)
|
||||
|
||||
pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
|
||||
if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
|
||||
if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies')
|
||||
if no_entities and 'entities' in pipeline: pipeline.remove('entities')
|
||||
pipeline = ['tagger', 'parser', 'ner']
|
||||
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
|
||||
if no_parser and 'parser' in pipeline: pipeline.remove('parser')
|
||||
if no_entities and 'ner' in pipeline: pipeline.remove('ner')
|
||||
|
||||
# Take dropout and batch size as generators of values -- dropout
|
||||
# starts high and decays sharply, to force the optimizer to explore.
|
||||
|
@ -66,27 +84,29 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
|||
util.env_opt('dropout_to', 0.2),
|
||||
util.env_opt('dropout_decay', 0.0))
|
||||
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
|
||||
util.env_opt('batch_to', 64),
|
||||
util.env_opt('batch_to', 16),
|
||||
util.env_opt('batch_compound', 1.001))
|
||||
|
||||
if resume:
|
||||
prints(output_path / 'model19.pickle', title="Resuming training")
|
||||
nlp = dill.load((output_path / 'model19.pickle').open('rb'))
|
||||
else:
|
||||
nlp = lang_class(pipeline=pipeline)
|
||||
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
||||
n_train_words = corpus.count_train()
|
||||
|
||||
lang_class = util.get_lang_class(lang)
|
||||
nlp = lang_class()
|
||||
meta['pipeline'] = pipeline
|
||||
nlp.meta.update(meta)
|
||||
if vectors:
|
||||
util.load_model(vectors, vocab=nlp.vocab)
|
||||
for name in pipeline:
|
||||
nlp.add_pipe(nlp.create_pipe(name), name=name)
|
||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||
nlp._optimizer = None
|
||||
|
||||
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
|
||||
print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
|
||||
try:
|
||||
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
|
||||
gold_preproc=gold_preproc, max_length=0)
|
||||
train_docs = list(train_docs)
|
||||
for i in range(n_iter):
|
||||
if resume:
|
||||
i += 20
|
||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||
train_docs = corpus.train_docs(nlp, projectivize=True,
|
||||
gold_preproc=False, max_length=0)
|
||||
losses = {}
|
||||
for batch in minibatch(train_docs, size=batch_sizes):
|
||||
docs, golds = zip(*batch)
|
||||
|
@ -98,24 +118,51 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
|||
util.set_env_log(False)
|
||||
epoch_model_path = output_path / ('model%d' % i)
|
||||
nlp.to_disk(epoch_model_path)
|
||||
with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
|
||||
dill.dump(nlp, file_, -1)
|
||||
nlp_loaded = lang_class(pipeline=pipeline)
|
||||
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
|
||||
scorer = nlp_loaded.evaluate(
|
||||
corpus.dev_docs(
|
||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
||||
dev_docs = list(corpus.dev_docs(
|
||||
nlp_loaded,
|
||||
gold_preproc=False))
|
||||
gold_preproc=gold_preproc))
|
||||
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
||||
start_time = timer()
|
||||
scorer = nlp_loaded.evaluate(dev_docs)
|
||||
end_time = timer()
|
||||
if use_gpu < 0:
|
||||
gpu_wps = None
|
||||
cpu_wps = nwords/(end_time-start_time)
|
||||
else:
|
||||
gpu_wps = nwords/(end_time-start_time)
|
||||
with Model.use_device('cpu'):
|
||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
||||
dev_docs = list(corpus.dev_docs(
|
||||
nlp_loaded, gold_preproc=gold_preproc))
|
||||
start_time = timer()
|
||||
scorer = nlp_loaded.evaluate(dev_docs)
|
||||
end_time = timer()
|
||||
cpu_wps = nwords/(end_time-start_time)
|
||||
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
|
||||
with acc_loc.open('w') as file_:
|
||||
file_.write(json_dumps(scorer.scores))
|
||||
meta_loc = output_path / ('model%d' % i) / 'meta.json'
|
||||
meta['accuracy'] = scorer.scores
|
||||
meta['speed'] = {'nwords': nwords, 'cpu':cpu_wps, 'gpu': gpu_wps}
|
||||
meta['lang'] = nlp.lang
|
||||
meta['pipeline'] = pipeline
|
||||
meta['spacy_version'] = '>=%s' % about.__version__
|
||||
meta.setdefault('name', 'model%d' % i)
|
||||
meta.setdefault('version', version)
|
||||
|
||||
with meta_loc.open('w') as file_:
|
||||
file_.write(json_dumps(meta))
|
||||
util.set_env_log(True)
|
||||
print_progress(i, losses, scorer.scores)
|
||||
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
|
||||
finally:
|
||||
print("Saving model...")
|
||||
with (output_path / 'model-final.pickle').open('wb') as file_:
|
||||
with nlp.use_params(optimizer.averages):
|
||||
dill.dump(nlp, file_, -1)
|
||||
try:
|
||||
with (output_path / 'model-final.pickle').open('wb') as file_:
|
||||
with nlp.use_params(optimizer.averages):
|
||||
dill.dump(nlp, file_, -1)
|
||||
except:
|
||||
print("Error saving model")
|
||||
|
||||
|
||||
def _render_parses(i, to_render):
|
||||
|
@ -128,25 +175,30 @@ def _render_parses(i, to_render):
|
|||
file_.write(html)
|
||||
|
||||
|
||||
def print_progress(itn, losses, dev_scores, wps=0.0):
|
||||
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
|
||||
scores = {}
|
||||
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
|
||||
'ents_p', 'ents_r', 'ents_f', 'wps']:
|
||||
'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
|
||||
scores[col] = 0.0
|
||||
scores['dep_loss'] = losses.get('parser', 0.0)
|
||||
scores['ner_loss'] = losses.get('ner', 0.0)
|
||||
scores['tag_loss'] = losses.get('tagger', 0.0)
|
||||
scores.update(dev_scores)
|
||||
scores['wps'] = wps
|
||||
scores['cpu_wps'] = cpu_wps
|
||||
scores['gpu_wps'] = gpu_wps or 0.0
|
||||
tpl = '\t'.join((
|
||||
'{:d}',
|
||||
'{dep_loss:.3f}',
|
||||
'{ner_loss:.3f}',
|
||||
'{uas:.3f}',
|
||||
'{ents_p:.3f}',
|
||||
'{ents_r:.3f}',
|
||||
'{ents_f:.3f}',
|
||||
'{tags_acc:.3f}',
|
||||
'{token_acc:.3f}',
|
||||
'{wps:.1f}'))
|
||||
'{cpu_wps:.1f}',
|
||||
'{gpu_wps:.1f}',
|
||||
))
|
||||
print(tpl.format(itn, **scores))
|
||||
|
||||
|
||||
|
|
123
spacy/cli/validate.py
Normal file
123
spacy/cli/validate.py
Normal file
|
@ -0,0 +1,123 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import requests
|
||||
import pkg_resources
|
||||
from pathlib import Path
|
||||
|
||||
from ..compat import path2str, locale_escape
|
||||
from ..util import prints, get_data_path, read_json
|
||||
from .. import about
|
||||
|
||||
|
||||
def validate(cmd):
|
||||
"""Validate that the currently installed version of spaCy is compatible
|
||||
with the installed models. Should be run after `pip install -U spacy`.
|
||||
"""
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
prints("Couldn't fetch compatibility table.",
|
||||
title="Server error (%d)" % r.status_code, exits=1)
|
||||
compat = r.json()['spacy']
|
||||
all_models = set()
|
||||
for spacy_v, models in dict(compat).items():
|
||||
all_models.update(models.keys())
|
||||
for model, model_vs in models.items():
|
||||
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
||||
|
||||
current_compat = compat[about.__version__]
|
||||
model_links = get_model_links(current_compat)
|
||||
model_pkgs = get_model_pkgs(current_compat, all_models)
|
||||
incompat_links = {l for l, d in model_links.items() if not d['compat']}
|
||||
incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']}
|
||||
incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']])
|
||||
na_models = [m for m in incompat_models if m not in current_compat]
|
||||
update_models = [m for m in incompat_models if m in current_compat]
|
||||
|
||||
prints(path2str(Path(__file__).parent.parent),
|
||||
title="Installed models (spaCy v{})".format(about.__version__))
|
||||
if model_links or model_pkgs:
|
||||
print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
|
||||
for name, data in model_pkgs.items():
|
||||
print(get_model_row(current_compat, name, data, 'package'))
|
||||
for name, data in model_links.items():
|
||||
print(get_model_row(current_compat, name, data, 'link'))
|
||||
else:
|
||||
prints("No models found in your current environment.", exits=0)
|
||||
|
||||
if update_models:
|
||||
cmd = ' python -m spacy download {}'
|
||||
print("\n Use the following commands to update the model packages:")
|
||||
print('\n'.join([cmd.format(pkg) for pkg in update_models]))
|
||||
|
||||
if na_models:
|
||||
prints("The following models are not available for spaCy v{}: {}"
|
||||
.format(about.__version__, ', '.join(na_models)))
|
||||
|
||||
if incompat_links:
|
||||
prints("You may also want to overwrite the incompatible links using "
|
||||
"the `spacy link` command with `--force`, or remove them from "
|
||||
"the data directory. Data path: {}"
|
||||
.format(path2str(get_data_path())))
|
||||
|
||||
|
||||
def get_model_links(compat):
|
||||
links = {}
|
||||
data_path = get_data_path()
|
||||
if data_path:
|
||||
models = [p for p in data_path.iterdir() if is_model_path(p)]
|
||||
for model in models:
|
||||
meta_path = Path(model) / 'meta.json'
|
||||
if not meta_path.exists():
|
||||
continue
|
||||
meta = read_json(meta_path)
|
||||
link = model.parts[-1]
|
||||
name = meta['lang'] + '_' + meta['name']
|
||||
links[link] = {'name': name, 'version': meta['version'],
|
||||
'compat': is_compat(compat, name, meta['version'])}
|
||||
return links
|
||||
|
||||
|
||||
def get_model_pkgs(compat, all_models):
|
||||
pkgs = {}
|
||||
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
||||
package = pkg_name.replace('-', '_')
|
||||
if package in all_models:
|
||||
version = pkg_data.version
|
||||
pkgs[pkg_name] = {'name': package, 'version': version,
|
||||
'compat': is_compat(compat, package, version)}
|
||||
return pkgs
|
||||
|
||||
|
||||
def get_model_row(compat, name, data, type='package'):
|
||||
tpl_row = ' {:<10}' + (' {:<20}' * 4)
|
||||
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
|
||||
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
|
||||
if data['compat']:
|
||||
comp = tpl_green.format(locale_escape('✔', errors='ignore'))
|
||||
version = tpl_green.format(data['version'])
|
||||
else:
|
||||
comp = '--> {}'.format(compat.get(data['name'], ['n/a'])[0])
|
||||
version = tpl_red.format(data['version'])
|
||||
return get_row(type, name, data['name'], version, comp)
|
||||
|
||||
|
||||
def get_row(*args):
|
||||
tpl_row = ' {:<10}' + (' {:<20}' * 4)
|
||||
return tpl_row.format(*args)
|
||||
|
||||
|
||||
def is_model_path(model_path):
|
||||
exclude = ['cache', 'pycache', '__pycache__']
|
||||
name = model_path.parts[-1]
|
||||
return model_path.is_dir() and name not in exclude and not name.startswith('.')
|
||||
|
||||
|
||||
def is_compat(compat, name, version):
|
||||
return name in compat and version in compat[name]
|
||||
|
||||
|
||||
def reformat_version(version):
|
||||
if version.endswith('-alpha'):
|
||||
return version.replace('-alpha', 'a0')
|
||||
return version.replace('-alpha', 'a')
|
|
@ -5,6 +5,8 @@ import six
|
|||
import ftfy
|
||||
import sys
|
||||
import ujson
|
||||
import itertools
|
||||
import locale
|
||||
|
||||
from thinc.neural.util import copy_array
|
||||
|
||||
|
@ -35,6 +37,7 @@ CudaStream = CudaStream
|
|||
cupy = cupy
|
||||
fix_text = ftfy.fix_text
|
||||
copy_array = copy_array
|
||||
izip = getattr(itertools, 'izip', zip)
|
||||
|
||||
is_python2 = six.PY2
|
||||
is_python3 = six.PY3
|
||||
|
@ -44,21 +47,31 @@ is_osx = sys.platform == 'darwin'
|
|||
|
||||
|
||||
if is_python2:
|
||||
import imp
|
||||
bytes_ = str
|
||||
unicode_ = unicode
|
||||
basestring_ = basestring
|
||||
input_ = raw_input
|
||||
json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
|
||||
json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False).decode('utf8')
|
||||
path2str = lambda path: str(path).decode('utf8')
|
||||
|
||||
elif is_python3:
|
||||
import importlib.util
|
||||
bytes_ = bytes
|
||||
unicode_ = str
|
||||
basestring_ = str
|
||||
input_ = input
|
||||
json_dumps = lambda data: ujson.dumps(data, indent=2)
|
||||
json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False)
|
||||
path2str = lambda path: str(path)
|
||||
|
||||
|
||||
def b_to_str(b_str):
|
||||
if is_python2:
|
||||
return b_str
|
||||
# important: if no encoding is set, string becomes "b'...'"
|
||||
return str(b_str, encoding='utf8')
|
||||
|
||||
|
||||
def getattr_(obj, name, *default):
|
||||
if is_python3 and isinstance(name, bytes):
|
||||
name = name.decode('utf8')
|
||||
|
@ -92,3 +105,21 @@ def normalize_string_keys(old):
|
|||
return new
|
||||
|
||||
|
||||
def import_file(name, loc):
|
||||
loc = str(loc)
|
||||
if is_python2:
|
||||
return imp.load_source(name, loc)
|
||||
else:
|
||||
spec = importlib.util.spec_from_file_location(name, str(loc))
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
return module
|
||||
|
||||
|
||||
def locale_escape(string, errors='replace'):
|
||||
'''
|
||||
Mangle non-supported characters, for savages with ascii terminals.
|
||||
'''
|
||||
encoding = locale.getpreferredencoding()
|
||||
string = string.encode(encoding, errors).decode('utf8')
|
||||
return string
|
||||
|
|
|
@ -15,7 +15,7 @@ def depr_model_download(lang):
|
|||
lang (unicode): Language shortcut, 'en' or 'de'.
|
||||
"""
|
||||
prints("The spacy.%s.download command is now deprecated. Please use "
|
||||
"python -m spacy download [model name or shortcut] instead. For "
|
||||
"spacy download [model name or shortcut] instead. For "
|
||||
"more info, see the documentation:" % lang,
|
||||
about.__docs_models__,
|
||||
"Downloading default '%s' model now..." % lang,
|
||||
|
|
|
@ -3,6 +3,7 @@ from __future__ import unicode_literals
|
|||
|
||||
from .render import DependencyRenderer, EntityRenderer
|
||||
from ..tokens import Doc
|
||||
from ..compat import b_to_str
|
||||
from ..util import prints, is_in_jupyter
|
||||
|
||||
|
||||
|
@ -65,7 +66,9 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
|||
|
||||
|
||||
def app(environ, start_response):
|
||||
start_response('200 OK', [('Content-type', 'text/html; charset=utf-8')])
|
||||
# headers and status need to be bytes in Python 2, see #1227
|
||||
headers = [(b_to_str(b'Content-type'), b_to_str(b'text/html; charset=utf-8'))]
|
||||
start_response(b_to_str(b'200 OK'), headers)
|
||||
res = _html['parsed'].encode(encoding='utf-8')
|
||||
return [res]
|
||||
|
||||
|
|
|
@ -60,7 +60,7 @@ GLOSSARY = {
|
|||
'JJR': 'adjective, comparative',
|
||||
'JJS': 'adjective, superlative',
|
||||
'LS': 'list item marker',
|
||||
'MD': 'verb, modal auxillary',
|
||||
'MD': 'verb, modal auxiliary',
|
||||
'NIL': 'missing tag',
|
||||
'NN': 'noun, singular or mass',
|
||||
'NNP': 'noun, proper singular',
|
||||
|
@ -91,7 +91,7 @@ GLOSSARY = {
|
|||
'NFP': 'superfluous punctuation',
|
||||
'GW': 'additional word in multi-word expression',
|
||||
'XX': 'unknown',
|
||||
'BES': 'auxillary "be"',
|
||||
'BES': 'auxiliary "be"',
|
||||
'HVS': 'forms of "have"',
|
||||
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@ cdef struct GoldParseC:
|
|||
int* tags
|
||||
int* heads
|
||||
int* has_dep
|
||||
int* sent_start
|
||||
attr_t* labels
|
||||
int** brackets
|
||||
Transition* ner
|
||||
|
@ -29,6 +30,7 @@ cdef class GoldParse:
|
|||
cdef public list ner
|
||||
cdef public list ents
|
||||
cdef public dict brackets
|
||||
cdef public object cats
|
||||
|
||||
cdef readonly list cand_to_gold
|
||||
cdef readonly list gold_to_cand
|
||||
|
|
|
@ -7,6 +7,7 @@ import re
|
|||
import ujson
|
||||
import random
|
||||
import cytoolz
|
||||
import itertools
|
||||
|
||||
from .syntax import nonproj
|
||||
from .util import ensure_path
|
||||
|
@ -146,9 +147,13 @@ def minibatch(items, size=8):
|
|||
'''Iterate over batches of items. `size` may be an iterator,
|
||||
so that batch-size can vary on each step.
|
||||
'''
|
||||
if isinstance(size, int):
|
||||
size_ = itertools.repeat(8)
|
||||
else:
|
||||
size_ = size
|
||||
items = iter(items)
|
||||
while True:
|
||||
batch_size = next(size) #if hasattr(size, '__next__') else size
|
||||
batch_size = next(size_)
|
||||
batch = list(cytoolz.take(int(batch_size), items))
|
||||
if len(batch) == 0:
|
||||
break
|
||||
|
@ -208,7 +213,7 @@ class GoldCorpus(object):
|
|||
train_tuples = self.train_tuples
|
||||
if projectivize:
|
||||
train_tuples = nonproj.preprocess_training_data(
|
||||
self.train_tuples)
|
||||
self.train_tuples, label_freq_cutoff=100)
|
||||
random.shuffle(train_tuples)
|
||||
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
||||
max_length=max_length,
|
||||
|
@ -381,7 +386,8 @@ cdef class GoldParse:
|
|||
make_projective=make_projective)
|
||||
|
||||
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
|
||||
deps=None, entities=None, make_projective=False):
|
||||
deps=None, entities=None, make_projective=False,
|
||||
cats=None):
|
||||
"""Create a GoldParse.
|
||||
|
||||
doc (Doc): The document the annotations refer to.
|
||||
|
@ -392,6 +398,15 @@ cdef class GoldParse:
|
|||
entities (iterable): A sequence of named entity annotations, either as
|
||||
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
|
||||
representing the entity positions.
|
||||
cats (dict): Labels for text classification. Each key in the dictionary
|
||||
may be a string or an int, or a `(start_char, end_char, label)`
|
||||
tuple, indicating that the label is applied to only part of the
|
||||
document (usually a sentence). Unlike entity annotations, label
|
||||
annotations can overlap, i.e. a single word can be covered by
|
||||
multiple labelled spans. The TextCategorizer component expects
|
||||
true examples of a label to have the value 1.0, and negative examples
|
||||
of a label to have the value 0.0. Labels not in the dictionary are
|
||||
treated as missing -- the gradient for those labels will be zero.
|
||||
RETURNS (GoldParse): The newly constructed object.
|
||||
"""
|
||||
if words is None:
|
||||
|
@ -399,11 +414,11 @@ cdef class GoldParse:
|
|||
if tags is None:
|
||||
tags = [None for _ in doc]
|
||||
if heads is None:
|
||||
heads = [token.i for token in doc]
|
||||
heads = [None for token in doc]
|
||||
if deps is None:
|
||||
deps = [None for _ in doc]
|
||||
if entities is None:
|
||||
entities = ['-' for _ in doc]
|
||||
entities = [None for _ in doc]
|
||||
elif len(entities) == 0:
|
||||
entities = ['O' for _ in doc]
|
||||
elif not isinstance(entities[0], basestring):
|
||||
|
@ -419,8 +434,10 @@ cdef class GoldParse:
|
|||
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
|
||||
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
|
||||
|
||||
self.cats = {} if cats is None else dict(cats)
|
||||
self.words = [None] * len(doc)
|
||||
self.tags = [None] * len(doc)
|
||||
self.heads = [None] * len(doc)
|
||||
|
@ -474,8 +491,12 @@ cdef class GoldParse:
|
|||
"""
|
||||
return not nonproj.is_nonproj_tree(self.heads)
|
||||
|
||||
@property
|
||||
def sent_starts(self):
|
||||
return [self.c.sent_start[i] for i in range(self.length)]
|
||||
|
||||
def biluo_tags_from_offsets(doc, entities):
|
||||
|
||||
def biluo_tags_from_offsets(doc, entities, missing='O'):
|
||||
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
||||
scheme (BILUO).
|
||||
|
||||
|
@ -527,7 +548,7 @@ def biluo_tags_from_offsets(doc, entities):
|
|||
if i in entity_chars:
|
||||
break
|
||||
else:
|
||||
biluo[token.i] = 'O'
|
||||
biluo[token.i] = missing
|
||||
return biluo
|
||||
|
||||
|
||||
|
|
|
@ -16,15 +16,13 @@ from ...util import update_exc
|
|||
class BengaliDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'bn'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = TAG_MAP
|
||||
stop_words = STOP_WORDS
|
||||
lemma_rules = LEMMA_RULES
|
||||
|
||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class Bengali(Language):
|
||||
|
|
|
@ -27,13 +27,21 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased)
|
|||
|
||||
_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
|
||||
'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
|
||||
'TB T G M K')
|
||||
'TB T G M K %')
|
||||
_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
|
||||
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
|
||||
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
|
||||
_hyphens = '- – — -- ---'
|
||||
|
||||
# These expressions contain various unicode variations, including characters
|
||||
# used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
|
||||
# conflicts, spaCy's base tokenizer should handle all of those by default
|
||||
_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · ।'
|
||||
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
|
||||
_hyphens = '- – — -- --- —— ~'
|
||||
|
||||
# Various symbols like dingbats, but also emoji
|
||||
# Details: https://www.compart.com/en/unicode/category/So
|
||||
_other_symbols = r'[\p{So}]'
|
||||
|
||||
|
||||
UNITS = merge_chars(_units)
|
||||
CURRENCY = merge_chars(_currency)
|
||||
QUOTES = merge_chars(_quotes)
|
||||
|
|
|
@ -19,11 +19,10 @@ class DanishDefaults(Language.Defaults):
|
|||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: 'da'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
#morph_rules = dict(MORPH_RULES)
|
||||
tag_map = dict(TAG_MAP)
|
||||
stop_words = set(STOP_WORDS)
|
||||
# morph_rules = MORPH_RULES
|
||||
tag_map = TAG_MAP
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Danish(Language):
|
||||
|
|
18
spacy/lang/da/examples.py
Normal file
18
spacy/lang/da/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.da.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Apple overvejer at købe et britisk statup for 1 milliard dollar",
|
||||
"Selvkørende biler flytter forsikringsansvaret over på producenterne",
|
||||
"San Francisco overvejer at forbyde leverandørrobotter på fortov",
|
||||
"London er en stor by i Storbritannien"
|
||||
]
|
|
@ -3,6 +3,7 @@ from __future__ import unicode_literals
|
|||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .norm_exceptions import NORM_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lemmatizer import LOOKUP
|
||||
|
@ -11,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
@ -21,15 +21,12 @@ class GermanDefaults(Language.Defaults):
|
|||
lex_attr_getters[LANG] = lambda text: 'de'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
|
||||
NORM_EXCEPTIONS, BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = dict(TAG_MAP)
|
||||
stop_words = set(STOP_WORDS)
|
||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
infixes = TOKENIZER_INFIXES
|
||||
tag_map = TAG_MAP
|
||||
stop_words = STOP_WORDS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
lemma_lookup = LOOKUP
|
||||
|
||||
|
||||
class German(Language):
|
||||
|
|
22
spacy/lang/de/examples.py
Normal file
22
spacy/lang/de/examples.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.de.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
|
||||
"Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
|
||||
"Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
|
||||
"Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
|
||||
"San Francisco erwägt Verbot von Lieferrobotern",
|
||||
"Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
|
||||
"Wo bist du?",
|
||||
"Was ist die Hauptstadt von Deutschland?"
|
||||
]
|
20
spacy/lang/de/punctuation.py
Normal file
20
spacy/lang/de/punctuation.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
||||
from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||
|
||||
|
||||
_quotes = QUOTES.replace("'", '')
|
||||
|
||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
||||
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
|
||||
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[0-9])-(?=[0-9])'])
|
||||
|
||||
|
||||
TOKENIZER_INFIXES = _infixes
|
|
@ -62,5 +62,5 @@ TAG_MAP = {
|
|||
"VVIZU": {POS: VERB, "VerbForm": "inf"},
|
||||
"VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
|
||||
"XY": {POS: X},
|
||||
"SP": {POS: SPACE}
|
||||
"_SP": {POS: SPACE}
|
||||
}
|
||||
|
|
|
@ -7,7 +7,7 @@ from .tag_map import TAG_MAP
|
|||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .morph_rules import MORPH_RULES
|
||||
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
|
||||
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
@ -16,22 +16,24 @@ from ...language import Language
|
|||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
def _return_en(_):
|
||||
return 'en'
|
||||
|
||||
class EnglishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: 'en'
|
||||
lex_attr_getters[LANG] = _return_en
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
|
||||
BASE_NORMS, NORM_EXCEPTIONS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = dict(TAG_MAP)
|
||||
stop_words = set(STOP_WORDS)
|
||||
morph_rules = dict(MORPH_RULES)
|
||||
lemma_rules = dict(LEMMA_RULES)
|
||||
lemma_index = dict(LEMMA_INDEX)
|
||||
lemma_exc = dict(LEMMA_EXC)
|
||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||
tag_map = TAG_MAP
|
||||
stop_words = STOP_WORDS
|
||||
morph_rules = MORPH_RULES
|
||||
lemma_rules = LEMMA_RULES
|
||||
lemma_index = LEMMA_INDEX
|
||||
lemma_exc = LEMMA_EXC
|
||||
lemma_lookup = LOOKUP
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class English(Language):
|
||||
|
|
22
spacy/lang/en/examples.py
Normal file
22
spacy/lang/en/examples.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.en.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Apple is looking at buying U.K. startup for $1 billion",
|
||||
"Autonomous cars shift insurance liability toward manufacturers",
|
||||
"San Francisco considers banning sidewalk delivery robots",
|
||||
"London is a big city in the United Kingdom.",
|
||||
"Where are you?",
|
||||
"Who is the president of France?",
|
||||
"What is the capital of the United States?",
|
||||
"When was Barack Obama born?"
|
||||
]
|
|
@ -59,7 +59,8 @@ MORPH_RULES = {
|
|||
|
||||
"VBP": {
|
||||
"are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
|
||||
"'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
|
||||
"'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
|
||||
"am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
|
||||
},
|
||||
|
||||
"VBD": {
|
||||
|
|
|
@ -8,7 +8,7 @@ def noun_chunks(obj):
|
|||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
|
||||
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'dative', 'appos',
|
||||
'attr', 'ROOT']
|
||||
doc = obj.doc # Ensure works on both Doc and Span.
|
||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
|
|
|
@ -55,11 +55,11 @@ TAG_MAP = {
|
|||
"WP": {POS: NOUN, "PronType": "int|rel"},
|
||||
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
||||
"WRB": {POS: ADV, "PronType": "int|rel"},
|
||||
"SP": {POS: SPACE},
|
||||
"ADD": {POS: X},
|
||||
"NFP": {POS: PUNCT},
|
||||
"GW": {POS: X},
|
||||
"XX": {POS: X},
|
||||
"BES": {POS: VERB},
|
||||
"HVS": {POS: VERB}
|
||||
"HVS": {POS: VERB},
|
||||
"_SP": {POS: SPACE},
|
||||
}
|
||||
|
|
|
@ -232,7 +232,10 @@ for verb_data in [
|
|||
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
|
||||
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
|
||||
{ORTH: "was", LEMMA: "be", NORM: "was"},
|
||||
{ORTH: "were", LEMMA: "be", NORM: "were"}]:
|
||||
{ORTH: "were", LEMMA: "be", NORM: "were"},
|
||||
{ORTH: "have", NORM: "have"},
|
||||
{ORTH: "has", LEMMA: "have", NORM: "has"},
|
||||
{ORTH: "dare", NORM: "dare"}]:
|
||||
verb_data_tc = dict(verb_data)
|
||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||
for data in [verb_data, verb_data_tc]:
|
||||
|
|
|
@ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
@ -19,15 +18,11 @@ class SpanishDefaults(Language.Defaults):
|
|||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'es'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = dict(TAG_MAP)
|
||||
stop_words = set(STOP_WORDS)
|
||||
sytax_iterators = dict(SYNTAX_ITERATORS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
tag_map = TAG_MAP
|
||||
stop_words = STOP_WORDS
|
||||
sytax_iterators = SYNTAX_ITERATORS
|
||||
lemma_lookup = LOOKUP
|
||||
|
||||
|
||||
class Spanish(Language):
|
||||
|
|
22
spacy/lang/es/examples.py
Normal file
22
spacy/lang/es/examples.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.es.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
|
||||
"Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
|
||||
"San Francisco analiza prohibir los robots delivery",
|
||||
"Londres es una gran ciudad del Reino Unido",
|
||||
"El gato come pescado",
|
||||
"Veo al hombre con el telescopio",
|
||||
"La araña come moscas",
|
||||
"El pingüino incuba en su nido"
|
||||
]
|
|
@ -303,5 +303,5 @@ TAG_MAP = {
|
|||
"VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"},
|
||||
"VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"},
|
||||
"X___": {"morph": "_", "pos": "X"},
|
||||
"SP": {"morph": "_", "pos": "SPACE"},
|
||||
"_SP": {"morph": "_", "pos": "SPACE"},
|
||||
}
|
||||
|
|
|
@ -15,9 +15,8 @@ class FinnishDefaults(Language.Defaults):
|
|||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'fi'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Finnish(Language):
|
||||
|
|
|
@ -4,32 +4,29 @@ from __future__ import unicode_literals
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
||||
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import LOOKUP
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class FrenchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: 'fr'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
stop_words = STOP_WORDS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
token_match = TOKEN_MATCH
|
||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
lemma_lookup = LOOKUP
|
||||
|
||||
|
||||
class French(Language):
|
||||
|
|
26
spacy/lang/fr/examples.py
Normal file
26
spacy/lang/fr/examples.py
Normal file
|
@ -0,0 +1,26 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.fr.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Apple cherche a acheter une startup anglaise pour 1 milliard de dollard",
|
||||
"Les voitures autonomes voient leur assurances décalées vers les constructeurs",
|
||||
"San Francisco envisage d'interdire les robots coursiers",
|
||||
"Londres est une grande ville du Royaume-Uni",
|
||||
"L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
|
||||
"Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
|
||||
"La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
|
||||
"Nouvelles attaques de Trump contre le maire de Londres",
|
||||
"Où es-tu ?",
|
||||
"Qui est le président de la France ?",
|
||||
"Où est la capitale des Etats-Unis ?",
|
||||
"Quand est né Barack Obama ?"
|
||||
]
|
41
spacy/lang/fr/lex_attrs.py
Normal file
41
spacy/lang/fr/lex_attrs.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
_num_words = set("""
|
||||
zero un deux trois quatre cinq six sept huit neuf dix
|
||||
onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
|
||||
vingt trente quanrante cinquante soixante septante quatre-vingt huitante nonante
|
||||
cent mille mil million milliard billion quadrillion quintillion
|
||||
sextillion septillion octillion nonillion decillion
|
||||
""".split())
|
||||
|
||||
_ordinal_words = set("""
|
||||
premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième
|
||||
onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neufième
|
||||
vingtième trentième quanrantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième
|
||||
centième millième millionnième milliardième billionnième quadrillionnième quintillionnième
|
||||
sextillionnième septillionnième octillionnième nonillionnième decillionnième
|
||||
""".split())
|
||||
|
||||
|
||||
def like_num(text):
|
||||
# Might require more work?
|
||||
# See this discussion: https://github.com/explosion/spaCy/pull/1161
|
||||
text = text.replace(',', '').replace('.', '')
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count('/') == 1:
|
||||
num, denom = text.split('/')
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
if text in _num_words:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
LEX_ATTRS = {
|
||||
LIKE_NUM: like_num
|
||||
}
|
|
@ -12,9 +12,8 @@ from ...util import update_exc
|
|||
class HebrewDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'he'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Hebrew(Language):
|
||||
|
|
28
spacy/lang/he/examples.py
Normal file
28
spacy/lang/he/examples.py
Normal file
|
@ -0,0 +1,28 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.he.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
|
||||
'רה"מ הודיע כי יחרים טקס בחסותו',
|
||||
'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
|
||||
'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
|
||||
'סע לשלום, המפתחות בפנים.',
|
||||
'מלצר, פעמיים טורקי!',
|
||||
'ואהבת לרעך כמוך.',
|
||||
'היום נעשה משהו בלתי נשכח.',
|
||||
'איפה הילד?',
|
||||
'מיהו נשיא צרפת?',
|
||||
'מהי בירת ארצות הברית?',
|
||||
"איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
|
||||
'מה הייתה הדקה?',
|
||||
'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
|
||||
]
|
24
spacy/lang/hi/__init__.py
Normal file
24
spacy/lang/hi/__init__.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
|
||||
|
||||
class HindiDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: 'hi'
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Hindi(Language):
|
||||
lang = 'hi'
|
||||
Defaults = HindiDefaults
|
||||
|
||||
|
||||
__all__ = ['Hindi']
|
38
spacy/lang/hi/lex_attrs.py
Normal file
38
spacy/lang/hi/lex_attrs.py
Normal file
|
@ -0,0 +1,38 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...attrs import NORM
|
||||
from ...util import add_lookups
|
||||
|
||||
|
||||
_stem_suffixes = [
|
||||
["ो","े","ू","ु","ी","ि","ा"],
|
||||
["कर","ाओ","िए","ाई","ाए","ने","नी","ना","ते","ीं","ती","ता","ाँ","ां","ों","ें"],
|
||||
["ाकर","ाइए","ाईं","ाया","ेगी","ेगा","ोगी","ोगे","ाने","ाना","ाते","ाती","ाता","तीं","ाओं","ाएं","ुओं","ुएं","ुआं"],
|
||||
["ाएगी","ाएगा","ाओगी","ाओगे","एंगी","ेंगी","एंगे","ेंगे","ूंगी","ूंगा","ातीं","नाओं","नाएं","ताओं","ताएं","ियाँ","ियों","ियां"],
|
||||
["ाएंगी","ाएंगे","ाऊंगी","ाऊंगा","ाइयाँ","ाइयों","ाइयां"]
|
||||
]
|
||||
|
||||
|
||||
def norm(string):
|
||||
# normalise base exceptions, e.g. punctuation or currency symbols
|
||||
if string in BASE_NORMS:
|
||||
return BASE_NORMS[string]
|
||||
# set stem word as norm, if available, adapted from:
|
||||
# http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
|
||||
# http://research.variancia.com/hindi_stemmer/
|
||||
# https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
|
||||
for suffix_group in reversed(_stem_suffixes):
|
||||
length = len(suffix_group[0])
|
||||
if len(string) <= length:
|
||||
break
|
||||
for suffix in suffix_group:
|
||||
if string.endswith(suffix):
|
||||
return string[:-length]
|
||||
return string
|
||||
|
||||
|
||||
LEX_ATTRS = {
|
||||
NORM: norm
|
||||
}
|
177
spacy/lang/hi/stop_words.py
Normal file
177
spacy/lang/hi/stop_words.py
Normal file
|
@ -0,0 +1,177 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt
|
||||
|
||||
STOP_WORDS = set("""
|
||||
अत
|
||||
अपना
|
||||
अपनी
|
||||
अपने
|
||||
अभी
|
||||
अंदर
|
||||
आदि
|
||||
आप
|
||||
इत्यादि
|
||||
इन
|
||||
इनका
|
||||
इन्हीं
|
||||
इन्हें
|
||||
इन्हों
|
||||
इस
|
||||
इसका
|
||||
इसकी
|
||||
इसके
|
||||
इसमें
|
||||
इसी
|
||||
इसे
|
||||
उन
|
||||
उनका
|
||||
उनकी
|
||||
उनके
|
||||
उनको
|
||||
उन्हीं
|
||||
उन्हें
|
||||
उन्हों
|
||||
उस
|
||||
उसके
|
||||
उसी
|
||||
उसे
|
||||
एक
|
||||
एवं
|
||||
एस
|
||||
ऐसे
|
||||
और
|
||||
कई
|
||||
कर
|
||||
करता
|
||||
करते
|
||||
करना
|
||||
करने
|
||||
करें
|
||||
कहते
|
||||
कहा
|
||||
का
|
||||
काफ़ी
|
||||
कि
|
||||
कितना
|
||||
किन्हें
|
||||
किन्हों
|
||||
किया
|
||||
किर
|
||||
किस
|
||||
किसी
|
||||
किसे
|
||||
की
|
||||
कुछ
|
||||
कुल
|
||||
के
|
||||
को
|
||||
कोई
|
||||
कौन
|
||||
कौनसा
|
||||
गया
|
||||
घर
|
||||
जब
|
||||
जहाँ
|
||||
जा
|
||||
जितना
|
||||
जिन
|
||||
जिन्हें
|
||||
जिन्हों
|
||||
जिस
|
||||
जिसे
|
||||
जीधर
|
||||
जैसा
|
||||
जैसे
|
||||
जो
|
||||
तक
|
||||
तब
|
||||
तरह
|
||||
तिन
|
||||
तिन्हें
|
||||
तिन्हों
|
||||
तिस
|
||||
तिसे
|
||||
तो
|
||||
था
|
||||
थी
|
||||
थे
|
||||
दबारा
|
||||
दिया
|
||||
दुसरा
|
||||
दूसरे
|
||||
दो
|
||||
द्वारा
|
||||
न
|
||||
नके
|
||||
नहीं
|
||||
ना
|
||||
निहायत
|
||||
नीचे
|
||||
ने
|
||||
पर
|
||||
पहले
|
||||
पूरा
|
||||
पे
|
||||
फिर
|
||||
बनी
|
||||
बही
|
||||
बहुत
|
||||
बाद
|
||||
बाला
|
||||
बिलकुल
|
||||
भी
|
||||
भीतर
|
||||
मगर
|
||||
मानो
|
||||
मे
|
||||
में
|
||||
यदि
|
||||
यह
|
||||
यहाँ
|
||||
यही
|
||||
या
|
||||
यिह
|
||||
ये
|
||||
रखें
|
||||
रहा
|
||||
रहे
|
||||
ऱ्वासा
|
||||
लिए
|
||||
लिये
|
||||
लेकिन
|
||||
व
|
||||
वग़ैरह
|
||||
वर्ग
|
||||
वह
|
||||
वहाँ
|
||||
वहीं
|
||||
वाले
|
||||
वुह
|
||||
वे
|
||||
सकता
|
||||
सकते
|
||||
सबसे
|
||||
सभी
|
||||
साथ
|
||||
साबुत
|
||||
साभ
|
||||
सारा
|
||||
से
|
||||
सो
|
||||
संग
|
||||
ही
|
||||
हुआ
|
||||
हुई
|
||||
हुए
|
||||
है
|
||||
हैं
|
||||
हो
|
||||
होता
|
||||
होती
|
||||
होते
|
||||
होना
|
||||
होने
|
||||
""".split())
|
|
@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
@ -18,17 +17,13 @@ class HungarianDefaults(Language.Defaults):
|
|||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'hu'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
stop_words = STOP_WORDS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
token_match = TOKEN_MATCH
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
lemma_lookup = LOOKUP
|
||||
|
||||
|
||||
class Hungarian(Language):
|
||||
|
|
17
spacy/lang/hu/examples.py
Normal file
17
spacy/lang/hu/examples.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.hu.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Az Apple egy brit startup vásárlását tervezi 1 milliárd dollár értékben.",
|
||||
"San Francisco vezetése mérlegeli a járdát használó szállító robotok betiltását.",
|
||||
"London az Egyesült Királyság egy nagy városa."
|
||||
]
|
36
spacy/lang/id/__init__.py
Normal file
36
spacy/lang/id/__init__.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .norm_exceptions import NORM_EXCEPTIONS
|
||||
from .lemmatizer import LOOKUP
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
class IndonesianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'id'
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
lemma_lookup = LOOKUP
|
||||
|
||||
|
||||
class Indonesian(Language):
|
||||
lang = 'id'
|
||||
Defaults = IndonesianDefaults
|
||||
|
||||
|
||||
__all__ = ['Indonesian']
|
3833
spacy/lang/id/_tokenizer_exceptions_list.py
Normal file
3833
spacy/lang/id/_tokenizer_exceptions_list.py
Normal file
File diff suppressed because it is too large
Load Diff
22
spacy/lang/id/examples.py
Normal file
22
spacy/lang/id/examples.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.en.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Al Qaidah mengklaim bom mobil yang menewaskan 60 Orang di Mali",
|
||||
"Abu Sayyaf mengeksekusi sandera warga Filipina",
|
||||
"Penyaluran pupuk berasal dari lima lokasi yakni Bontang, Kalimantan Timur, Surabaya, Banyuwangi, Semarang, dan Makassar.",
|
||||
"PT Pupuk Kaltim telah menyalurkan 274.707 ton pupuk bersubsidi ke wilayah penyaluran di 14 provinsi.",
|
||||
"Jakarta adalah kota besar yang nyaris tidak pernah tidur."
|
||||
"Kamu ada di mana semalam?",
|
||||
"Siapa yang membeli makanan ringan tersebut?",
|
||||
"Siapa presiden pertama Republik Indonesia?"
|
||||
]
|
36883
spacy/lang/id/lemmatizer.py
Normal file
36883
spacy/lang/id/lemmatizer.py
Normal file
File diff suppressed because it is too large
Load Diff
41
spacy/lang/id/lex_attrs.py
Normal file
41
spacy/lang/id/lex_attrs.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
_num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
|
||||
'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
|
||||
'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
|
||||
'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
|
||||
'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
|
||||
'gajillion', 'bazillion',
|
||||
'nol', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
|
||||
'delapan', 'sembilan', 'sepuluh', 'sebelas', 'duabelas', 'tigabelas',
|
||||
'empatbelas', 'limabelas', 'enambelas', 'tujuhbelas', 'delapanbelas',
|
||||
'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
|
||||
'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
|
||||
'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
|
||||
'noniliun', 'desiliun']
|
||||
|
||||
|
||||
def like_num(text):
|
||||
text = text.replace(',', '').replace('.', '')
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count('/') == 1:
|
||||
num, denom = text.split('/')
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
if text in _num_words:
|
||||
return True
|
||||
if text.count('-') == 1:
|
||||
_, num = text.split('-')
|
||||
if num.isdigit() or num in _num_words:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
LEX_ATTRS = {
|
||||
LIKE_NUM: like_num
|
||||
}
|
17
spacy/lang/id/norm_exceptions.py
Normal file
17
spacy/lang/id/norm_exceptions.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
_exc = {
|
||||
"Rp": "$",
|
||||
"IDR": "$",
|
||||
"RMB": "$",
|
||||
"USD": "$",
|
||||
"AUD": "$",
|
||||
"GBP": "$",
|
||||
}
|
||||
|
||||
NORM_EXCEPTIONS = {}
|
||||
|
||||
for string, norm in _exc.items():
|
||||
NORM_EXCEPTIONS[string] = norm
|
||||
NORM_EXCEPTIONS[string.title()] = norm
|
53
spacy/lang/id/punctuation.py
Normal file
53
spacy/lang/id/punctuation.py
Normal file
|
@ -0,0 +1,53 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from ..char_classes import merge_chars, split_chars, _currency, _units
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
|
||||
from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
|
||||
|
||||
_units = (_units + 's bit Gbps Mbps mbps Kbps kbps ƒ ppi px '
|
||||
'Hz kHz MHz GHz mAh '
|
||||
'ratus rb ribu ribuan '
|
||||
'juta jt jutaan mill?iar million bil[l]?iun bilyun billion '
|
||||
)
|
||||
_currency = (_currency + r' USD Rp IDR RMB SGD S\$')
|
||||
_months = ('Januari Februari Maret April Mei Juni Juli Agustus September '
|
||||
'Oktober November Desember January February March May June '
|
||||
'July August October December Jan Feb Mar Jun Jul Aug Sept '
|
||||
'Oct Okt Nov Des ')
|
||||
|
||||
|
||||
UNITS = merge_chars(_units)
|
||||
CURRENCY = merge_chars(_currency)
|
||||
HTML_PREFIX = r'<(b|strong|i|em|p|span|div|br)\s?/>|<a([^>]+)>'
|
||||
HTML_SUFFIX = r'</(b|strong|i|em|p|span|div|a)>'
|
||||
MONTHS = merge_chars(_months)
|
||||
LIST_CURRENCY = split_chars(_currency)
|
||||
|
||||
TOKENIZER_PREFIXES.remove('#') # hashtag
|
||||
_prefixes = TOKENIZER_PREFIXES + LIST_CURRENCY + [HTML_PREFIX] + ['/', '—']
|
||||
|
||||
_suffixes = TOKENIZER_SUFFIXES + [r'\-[Nn]ya', '-[KkMm]u', '[—-]'] + [
|
||||
r'(?<={c})(?:[0-9]+)'.format(c=CURRENCY),
|
||||
r'(?<=[0-9])(?:{u})'.format(u=UNITS),
|
||||
r'(?<=[0-9])%',
|
||||
r'(?<=[0-9{a}]{h})(?:[\.,:-])'.format(a=ALPHA, h=HTML_SUFFIX),
|
||||
r'(?<=[0-9{a}])(?:{h})'.format(a=ALPHA, h=HTML_SUFFIX),
|
||||
]
|
||||
|
||||
_infixes = TOKENIZER_INFIXES + [
|
||||
r'(?<=[0-9])[\\/](?=[0-9%-])',
|
||||
r'(?<=[0-9])%(?=[{a}0-9/])'.format(a=ALPHA),
|
||||
r'(?<={u})[\/-](?=[0-9])'.format(u=UNITS),
|
||||
r'(?<={m})[\/-](?=[0-9])'.format(m=MONTHS),
|
||||
r'(?<=[0-9\)][\.,])"(?=[0-9])',
|
||||
r'(?<=[{a}\)][\.,\'])["—](?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}])-(?=[0-9])'.format(a=ALPHA),
|
||||
r'(?<=[0-9])-(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}])[\/-](?={c}{a})'.format(a=ALPHA, c=CURRENCY),
|
||||
]
|
||||
|
||||
TOKENIZER_PREFIXES = _prefixes
|
||||
TOKENIZER_SUFFIXES = _suffixes
|
||||
TOKENIZER_INFIXES = _infixes
|
763
spacy/lang/id/stop_words.py
Normal file
763
spacy/lang/id/stop_words.py
Normal file
|
@ -0,0 +1,763 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
STOP_WORDS = set("""
|
||||
ada
|
||||
adalah
|
||||
adanya
|
||||
adapun
|
||||
agak
|
||||
agaknya
|
||||
agar
|
||||
akan
|
||||
akankah
|
||||
akhir
|
||||
akhiri
|
||||
akhirnya
|
||||
aku
|
||||
akulah
|
||||
amat
|
||||
amatlah
|
||||
anda
|
||||
andalah
|
||||
antar
|
||||
antara
|
||||
antaranya
|
||||
apa
|
||||
apaan
|
||||
apabila
|
||||
apakah
|
||||
apalagi
|
||||
apatah
|
||||
artinya
|
||||
asal
|
||||
asalkan
|
||||
atas
|
||||
atau
|
||||
ataukah
|
||||
ataupun
|
||||
awal
|
||||
awalnya
|
||||
bagai
|
||||
bagaikan
|
||||
bagaimana
|
||||
bagaimanakah
|
||||
bagaimanapun
|
||||
bagi
|
||||
bagian
|
||||
bahkan
|
||||
bahwa
|
||||
bahwasanya
|
||||
baik
|
||||
bakal
|
||||
bakalan
|
||||
balik
|
||||
banyak
|
||||
bapak
|
||||
baru
|
||||
bawah
|
||||
beberapa
|
||||
begini
|
||||
beginian
|
||||
beginikah
|
||||
beginilah
|
||||
begitu
|
||||
begitukah
|
||||
begitulah
|
||||
begitupun
|
||||
bekerja
|
||||
belakang
|
||||
belakangan
|
||||
belum
|
||||
belumlah
|
||||
benar
|
||||
benarkah
|
||||
benarlah
|
||||
berada
|
||||
berakhir
|
||||
berakhirlah
|
||||
berakhirnya
|
||||
berapa
|
||||
berapakah
|
||||
berapalah
|
||||
berapapun
|
||||
berarti
|
||||
berawal
|
||||
berbagai
|
||||
berdatangan
|
||||
beri
|
||||
berikan
|
||||
berikut
|
||||
berikutnya
|
||||
berjumlah
|
||||
berkali-kali
|
||||
berkata
|
||||
berkehendak
|
||||
berkeinginan
|
||||
berkenaan
|
||||
berlainan
|
||||
berlalu
|
||||
berlangsung
|
||||
berlebihan
|
||||
bermacam
|
||||
bermacam-macam
|
||||
bermaksud
|
||||
bermula
|
||||
bersama
|
||||
bersama-sama
|
||||
bersiap
|
||||
bersiap-siap
|
||||
bertanya
|
||||
bertanya-tanya
|
||||
berturut
|
||||
berturut-turut
|
||||
bertutur
|
||||
berujar
|
||||
berupa
|
||||
besar
|
||||
betul
|
||||
betulkah
|
||||
biasa
|
||||
biasanya
|
||||
bila
|
||||
bilakah
|
||||
bisa
|
||||
bisakah
|
||||
boleh
|
||||
bolehkah
|
||||
bolehlah
|
||||
buat
|
||||
bukan
|
||||
bukankah
|
||||
bukanlah
|
||||
bukannya
|
||||
bulan
|
||||
bung
|
||||
cara
|
||||
caranya
|
||||
cukup
|
||||
cukupkah
|
||||
cukuplah
|
||||
cuma
|
||||
dahulu
|
||||
dalam
|
||||
dan
|
||||
dapat
|
||||
dari
|
||||
daripada
|
||||
datang
|
||||
dekat
|
||||
demi
|
||||
demikian
|
||||
demikianlah
|
||||
dengan
|
||||
depan
|
||||
di
|
||||
dia
|
||||
diakhiri
|
||||
diakhirinya
|
||||
dialah
|
||||
diantara
|
||||
diantaranya
|
||||
diberi
|
||||
diberikan
|
||||
diberikannya
|
||||
dibuat
|
||||
dibuatnya
|
||||
didapat
|
||||
didatangkan
|
||||
digunakan
|
||||
diibaratkan
|
||||
diibaratkannya
|
||||
diingat
|
||||
diingatkan
|
||||
diinginkan
|
||||
dijawab
|
||||
dijelaskan
|
||||
dijelaskannya
|
||||
dikarenakan
|
||||
dikatakan
|
||||
dikatakannya
|
||||
dikerjakan
|
||||
diketahui
|
||||
diketahuinya
|
||||
dikira
|
||||
dilakukan
|
||||
dilalui
|
||||
dilihat
|
||||
dimaksud
|
||||
dimaksudkan
|
||||
dimaksudkannya
|
||||
dimaksudnya
|
||||
diminta
|
||||
dimintai
|
||||
dimisalkan
|
||||
dimulai
|
||||
dimulailah
|
||||
dimulainya
|
||||
dimungkinkan
|
||||
dini
|
||||
dipastikan
|
||||
diperbuat
|
||||
diperbuatnya
|
||||
dipergunakan
|
||||
diperkirakan
|
||||
diperlihatkan
|
||||
diperlukan
|
||||
diperlukannya
|
||||
dipersoalkan
|
||||
dipertanyakan
|
||||
dipunyai
|
||||
diri
|
||||
dirinya
|
||||
disampaikan
|
||||
disebut
|
||||
disebutkan
|
||||
disebutkannya
|
||||
disini
|
||||
disinilah
|
||||
ditambahkan
|
||||
ditandaskan
|
||||
ditanya
|
||||
ditanyai
|
||||
ditanyakan
|
||||
ditegaskan
|
||||
ditujukan
|
||||
ditunjuk
|
||||
ditunjuki
|
||||
ditunjukkan
|
||||
ditunjukkannya
|
||||
ditunjuknya
|
||||
dituturkan
|
||||
dituturkannya
|
||||
diucapkan
|
||||
diucapkannya
|
||||
diungkapkan
|
||||
dong
|
||||
dua
|
||||
dulu
|
||||
empat
|
||||
enggak
|
||||
enggaknya
|
||||
entah
|
||||
entahlah
|
||||
guna
|
||||
gunakan
|
||||
hal
|
||||
hampir
|
||||
hanya
|
||||
hanyalah
|
||||
hari
|
||||
harus
|
||||
haruslah
|
||||
harusnya
|
||||
hendak
|
||||
hendaklah
|
||||
hendaknya
|
||||
hingga
|
||||
ia
|
||||
ialah
|
||||
ibarat
|
||||
ibaratkan
|
||||
ibaratnya
|
||||
ibu
|
||||
ikut
|
||||
ingat
|
||||
ingat-ingat
|
||||
ingin
|
||||
inginkah
|
||||
inginkan
|
||||
ini
|
||||
inikah
|
||||
inilah
|
||||
itu
|
||||
itukah
|
||||
itulah
|
||||
jadi
|
||||
jadilah
|
||||
jadinya
|
||||
jangan
|
||||
jangankan
|
||||
janganlah
|
||||
jauh
|
||||
jawab
|
||||
jawaban
|
||||
jawabnya
|
||||
jelas
|
||||
jelaskan
|
||||
jelaslah
|
||||
jelasnya
|
||||
jika
|
||||
jikalau
|
||||
juga
|
||||
jumlah
|
||||
jumlahnya
|
||||
justru
|
||||
kala
|
||||
kalau
|
||||
kalaulah
|
||||
kalaupun
|
||||
kalian
|
||||
kami
|
||||
kamilah
|
||||
kamu
|
||||
kamulah
|
||||
kan
|
||||
kapan
|
||||
kapankah
|
||||
kapanpun
|
||||
karena
|
||||
karenanya
|
||||
kasus
|
||||
kata
|
||||
katakan
|
||||
katakanlah
|
||||
katanya
|
||||
ke
|
||||
keadaan
|
||||
kebetulan
|
||||
kecil
|
||||
kedua
|
||||
keduanya
|
||||
keinginan
|
||||
kelamaan
|
||||
kelihatan
|
||||
kelihatannya
|
||||
kelima
|
||||
keluar
|
||||
kembali
|
||||
kemudian
|
||||
kemungkinan
|
||||
kemungkinannya
|
||||
kenapa
|
||||
kepada
|
||||
kepadanya
|
||||
kesampaian
|
||||
keseluruhan
|
||||
keseluruhannya
|
||||
keterlaluan
|
||||
ketika
|
||||
khususnya
|
||||
kini
|
||||
kinilah
|
||||
kira
|
||||
kira-kira
|
||||
kiranya
|
||||
kita
|
||||
kitalah
|
||||
kok
|
||||
kurang
|
||||
lagi
|
||||
lagian
|
||||
lah
|
||||
lain
|
||||
lainnya
|
||||
lalu
|
||||
lama
|
||||
lamanya
|
||||
lanjut
|
||||
lanjutnya
|
||||
lebih
|
||||
lewat
|
||||
lima
|
||||
luar
|
||||
macam
|
||||
maka
|
||||
makanya
|
||||
makin
|
||||
malah
|
||||
malahan
|
||||
mampu
|
||||
mampukah
|
||||
mana
|
||||
manakala
|
||||
manalagi
|
||||
masa
|
||||
masalah
|
||||
masalahnya
|
||||
masih
|
||||
masihkah
|
||||
masing
|
||||
masing-masing
|
||||
mau
|
||||
maupun
|
||||
melainkan
|
||||
melakukan
|
||||
melalui
|
||||
melihat
|
||||
melihatnya
|
||||
memang
|
||||
memastikan
|
||||
memberi
|
||||
memberikan
|
||||
membuat
|
||||
memerlukan
|
||||
memihak
|
||||
meminta
|
||||
memintakan
|
||||
memisalkan
|
||||
memperbuat
|
||||
mempergunakan
|
||||
memperkirakan
|
||||
memperlihatkan
|
||||
mempersiapkan
|
||||
mempersoalkan
|
||||
mempertanyakan
|
||||
mempunyai
|
||||
memulai
|
||||
memungkinkan
|
||||
menaiki
|
||||
menambahkan
|
||||
menandaskan
|
||||
menanti
|
||||
menanti-nanti
|
||||
menantikan
|
||||
menanya
|
||||
menanyai
|
||||
menanyakan
|
||||
mendapat
|
||||
mendapatkan
|
||||
mendatang
|
||||
mendatangi
|
||||
mendatangkan
|
||||
menegaskan
|
||||
mengakhiri
|
||||
mengapa
|
||||
mengatakan
|
||||
mengatakannya
|
||||
mengenai
|
||||
mengerjakan
|
||||
mengetahui
|
||||
menggunakan
|
||||
menghendaki
|
||||
mengibaratkan
|
||||
mengibaratkannya
|
||||
mengingat
|
||||
mengingatkan
|
||||
menginginkan
|
||||
mengira
|
||||
mengucapkan
|
||||
mengucapkannya
|
||||
mengungkapkan
|
||||
menjadi
|
||||
menjawab
|
||||
menjelaskan
|
||||
menuju
|
||||
menunjuk
|
||||
menunjuki
|
||||
menunjukkan
|
||||
menunjuknya
|
||||
menurut
|
||||
menuturkan
|
||||
menyampaikan
|
||||
menyangkut
|
||||
menyatakan
|
||||
menyebutkan
|
||||
menyeluruh
|
||||
menyiapkan
|
||||
merasa
|
||||
mereka
|
||||
merekalah
|
||||
merupakan
|
||||
meski
|
||||
meskipun
|
||||
meyakini
|
||||
meyakinkan
|
||||
minta
|
||||
mirip
|
||||
misal
|
||||
misalkan
|
||||
misalnya
|
||||
mula
|
||||
mulai
|
||||
mulailah
|
||||
mulanya
|
||||
mungkin
|
||||
mungkinkah
|
||||
nah
|
||||
naik
|
||||
namun
|
||||
nanti
|
||||
nantinya
|
||||
nyaris
|
||||
nyatanya
|
||||
oleh
|
||||
olehnya
|
||||
pada
|
||||
padahal
|
||||
padanya
|
||||
pak
|
||||
paling
|
||||
panjang
|
||||
pantas
|
||||
para
|
||||
pasti
|
||||
pastilah
|
||||
penting
|
||||
pentingnya
|
||||
per
|
||||
percuma
|
||||
perlu
|
||||
perlukah
|
||||
perlunya
|
||||
pernah
|
||||
persoalan
|
||||
pertama
|
||||
pertama-tama
|
||||
pertanyaan
|
||||
pertanyakan
|
||||
pihak
|
||||
pihaknya
|
||||
pukul
|
||||
pula
|
||||
pun
|
||||
punya
|
||||
rasa
|
||||
rasanya
|
||||
rata
|
||||
rupanya
|
||||
saat
|
||||
saatnya
|
||||
saja
|
||||
sajalah
|
||||
saling
|
||||
sama
|
||||
sama-sama
|
||||
sambil
|
||||
sampai
|
||||
sampai-sampai
|
||||
sampaikan
|
||||
sana
|
||||
sangat
|
||||
sangatlah
|
||||
satu
|
||||
saya
|
||||
sayalah
|
||||
se
|
||||
sebab
|
||||
sebabnya
|
||||
sebagai
|
||||
sebagaimana
|
||||
sebagainya
|
||||
sebagian
|
||||
sebaik
|
||||
sebaik-baiknya
|
||||
sebaiknya
|
||||
sebaliknya
|
||||
sebanyak
|
||||
sebegini
|
||||
sebegitu
|
||||
sebelum
|
||||
sebelumnya
|
||||
sebenarnya
|
||||
seberapa
|
||||
sebesar
|
||||
sebetulnya
|
||||
sebisanya
|
||||
sebuah
|
||||
sebut
|
||||
sebutlah
|
||||
sebutnya
|
||||
secara
|
||||
secukupnya
|
||||
sedang
|
||||
sedangkan
|
||||
sedemikian
|
||||
sedikit
|
||||
sedikitnya
|
||||
seenaknya
|
||||
segala
|
||||
segalanya
|
||||
segera
|
||||
seharusnya
|
||||
sehingga
|
||||
seingat
|
||||
sejak
|
||||
sejauh
|
||||
sejenak
|
||||
sejumlah
|
||||
sekadar
|
||||
sekadarnya
|
||||
sekali
|
||||
sekali-kali
|
||||
sekalian
|
||||
sekaligus
|
||||
sekalipun
|
||||
sekarang
|
||||
sekarang
|
||||
sekecil
|
||||
seketika
|
||||
sekiranya
|
||||
sekitar
|
||||
sekitarnya
|
||||
sekurang-kurangnya
|
||||
sekurangnya
|
||||
sela
|
||||
selain
|
||||
selaku
|
||||
selalu
|
||||
selama
|
||||
selama-lamanya
|
||||
selamanya
|
||||
selanjutnya
|
||||
seluruh
|
||||
seluruhnya
|
||||
semacam
|
||||
semakin
|
||||
semampu
|
||||
semampunya
|
||||
semasa
|
||||
semasih
|
||||
semata
|
||||
semata-mata
|
||||
semaunya
|
||||
sementara
|
||||
semisal
|
||||
semisalnya
|
||||
sempat
|
||||
semua
|
||||
semuanya
|
||||
semula
|
||||
sendiri
|
||||
sendirian
|
||||
sendirinya
|
||||
seolah
|
||||
seolah-olah
|
||||
seorang
|
||||
sepanjang
|
||||
sepantasnya
|
||||
sepantasnyalah
|
||||
seperlunya
|
||||
seperti
|
||||
sepertinya
|
||||
sepihak
|
||||
sering
|
||||
seringnya
|
||||
serta
|
||||
serupa
|
||||
sesaat
|
||||
sesama
|
||||
sesampai
|
||||
sesegera
|
||||
sesekali
|
||||
seseorang
|
||||
sesuatu
|
||||
sesuatunya
|
||||
sesudah
|
||||
sesudahnya
|
||||
setelah
|
||||
setempat
|
||||
setengah
|
||||
seterusnya
|
||||
setiap
|
||||
setiba
|
||||
setibanya
|
||||
setidak-tidaknya
|
||||
setidaknya
|
||||
setinggi
|
||||
seusai
|
||||
sewaktu
|
||||
siap
|
||||
siapa
|
||||
siapakah
|
||||
siapapun
|
||||
sini
|
||||
sinilah
|
||||
soal
|
||||
soalnya
|
||||
suatu
|
||||
sudah
|
||||
sudahkah
|
||||
sudahlah
|
||||
supaya
|
||||
tadi
|
||||
tadinya
|
||||
tahu
|
||||
tahun
|
||||
tak
|
||||
tambah
|
||||
tambahnya
|
||||
tampak
|
||||
tampaknya
|
||||
tandas
|
||||
tandasnya
|
||||
tanpa
|
||||
tanya
|
||||
tanyakan
|
||||
tanyanya
|
||||
tapi
|
||||
tegas
|
||||
tegasnya
|
||||
telah
|
||||
tempat
|
||||
tengah
|
||||
tentang
|
||||
tentu
|
||||
tentulah
|
||||
tentunya
|
||||
tepat
|
||||
terakhir
|
||||
terasa
|
||||
terbanyak
|
||||
terdahulu
|
||||
terdapat
|
||||
terdiri
|
||||
terhadap
|
||||
terhadapnya
|
||||
teringat
|
||||
teringat-ingat
|
||||
terjadi
|
||||
terjadilah
|
||||
terjadinya
|
||||
terkira
|
||||
terlalu
|
||||
terlebih
|
||||
terlihat
|
||||
termasuk
|
||||
ternyata
|
||||
tersampaikan
|
||||
tersebut
|
||||
tersebutlah
|
||||
tertentu
|
||||
tertuju
|
||||
terus
|
||||
terutama
|
||||
tetap
|
||||
tetapi
|
||||
tiap
|
||||
tiba
|
||||
tiba-tiba
|
||||
tidak
|
||||
tidakkah
|
||||
tidaklah
|
||||
tiga
|
||||
tinggi
|
||||
toh
|
||||
tunjuk
|
||||
turut
|
||||
tutur
|
||||
tuturnya
|
||||
ucap
|
||||
ucapnya
|
||||
ujar
|
||||
ujarnya
|
||||
umum
|
||||
umumnya
|
||||
ungkap
|
||||
ungkapnya
|
||||
untuk
|
||||
usah
|
||||
usai
|
||||
waduh
|
||||
wah
|
||||
wahai
|
||||
waktu
|
||||
waktunya
|
||||
walau
|
||||
walaupun
|
||||
wong
|
||||
yaitu
|
||||
yakin
|
||||
yakni
|
||||
yang
|
||||
""".split())
|
42
spacy/lang/id/syntax_iterators.py
Normal file
42
spacy/lang/id/syntax_iterators.py
Normal file
|
@ -0,0 +1,42 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
|
||||
|
||||
def noun_chunks(obj):
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
|
||||
doc = obj.doc # Ensure works on both Doc and Span.
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add('conj')
|
||||
np_label = doc.vocab.strings.add('NP')
|
||||
seen = set()
|
||||
for i, word in enumerate(obj):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.i in seen:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||
elif word.dep == conj:
|
||||
head = word.head
|
||||
while head.dep == conj and head.head.i < head.i:
|
||||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||
|
||||
|
||||
SYNTAX_ITERATORS = {
|
||||
'noun_chunks': noun_chunks
|
||||
}
|
50
spacy/lang/id/tokenizer_exceptions.py
Normal file
50
spacy/lang/id/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import regex as re
|
||||
|
||||
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
|
||||
from ..tokenizer_exceptions import URL_PATTERN
|
||||
from ...symbols import ORTH
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
||||
for orth in ID_BASE_EXCEPTIONS:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
orth_title = orth.title()
|
||||
_exc[orth_title] = [{ORTH: orth_title}]
|
||||
|
||||
orth_caps = orth.upper()
|
||||
_exc[orth_caps] = [{ORTH: orth_caps}]
|
||||
|
||||
orth_lower = orth.lower()
|
||||
_exc[orth_lower] = [{ORTH: orth_lower}]
|
||||
|
||||
if '-' in orth:
|
||||
orth_title = '-'.join([part.title() for part in orth.split('-')])
|
||||
_exc[orth_title] = [{ORTH: orth_title}]
|
||||
|
||||
orth_caps = '-'.join([part.upper() for part in orth.split('-')])
|
||||
_exc[orth_caps] = [{ORTH: orth_caps}]
|
||||
|
||||
|
||||
for orth in [
|
||||
"'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
|
||||
"E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
|
||||
"Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
|
||||
"Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs.",
|
||||
"B.A.", "B.Ch.E.", "B.Sc.", "Dr.", "Dra.", "Drs.", "Hj.", "Ka.", "Kp.",
|
||||
"M.Ag.", "M.Hum.", "M.Kes,", "M.Kom.", "M.M.", "M.P.", "M.Pd.", "M.Sc.",
|
||||
"M.Si.", "M.Sn.", "M.T.", "M.Th.", "No.", "Pjs.", "Plt.", "R.A.", "S.Ag.",
|
||||
"S.E.", "S.H.", "S.Hut.", "S.K.M.", "S.Kedg.", "S.Kedh.", "S.Kom.",
|
||||
"S.Pd.", "S.Pol.", "S.Psi.", "S.S.", "S.Sos.", "S.T.", "S.Tekp.", "S.Th.",
|
||||
"a.l.", "a.n.", "a.s.", "b.d.", "d.a.", "d.l.", "d/h", "dkk.", "dll.",
|
||||
"dr.", "drh.", "ds.", "dsb.", "dst.", "faks.", "fax.", "hlm.", "i/o",
|
||||
"n.b.", "p.p." "pjs.", "s.d.", "tel.", "u.p.",
|
||||
]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = dict(_exc)
|
||||
|
|
@ -7,7 +7,6 @@ from .lemmatizer import LOOKUP
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
@ -16,13 +15,9 @@ class ItalianDefaults(Language.Defaults):
|
|||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'it'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
stop_words = STOP_WORDS
|
||||
lemma_lookup = LOOKUP
|
||||
|
||||
|
||||
class Italian(Language):
|
||||
|
|
18
spacy/lang/it/examples.py
Normal file
18
spacy/lang/it/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.it.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
|
||||
"Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
|
||||
"San Francisco prevede di bandire i robot di consegna porta a porta",
|
||||
"Londra è una grande città del Regno Unito."
|
||||
]
|
|
@ -4,18 +4,36 @@ from __future__ import unicode_literals, print_function
|
|||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...tokens import Doc
|
||||
from ...tokenizer import Tokenizer
|
||||
|
||||
|
||||
class JapaneseTokenizer(object):
|
||||
def __init__(self, cls, nlp=None):
|
||||
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
||||
try:
|
||||
from janome.tokenizer import Tokenizer
|
||||
except ImportError:
|
||||
raise ImportError("The Japanese tokenizer requires the Janome "
|
||||
"library: https://github.com/mocobeta/janome")
|
||||
self.tokenizer = Tokenizer()
|
||||
|
||||
def __call__(self, text):
|
||||
words = [x.surface for x in self.tokenizer.tokenize(text)]
|
||||
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||
|
||||
|
||||
class JapaneseDefaults(Language.Defaults):
|
||||
@classmethod
|
||||
def create_tokenizer(cls, nlp=None):
|
||||
return JapaneseTokenizer(cls, nlp)
|
||||
|
||||
|
||||
class Japanese(Language):
|
||||
lang = 'ja'
|
||||
Defaults = JapaneseDefaults
|
||||
|
||||
def make_doc(self, text):
|
||||
try:
|
||||
from janome.tokenizer import Tokenizer
|
||||
except ImportError:
|
||||
raise ImportError("The Japanese tokenizer requires the Janome library: "
|
||||
"https://github.com/mocobeta/janome")
|
||||
words = [x.surface for x in Tokenizer().tokenize(text)]
|
||||
words = self.tokenizer(text)
|
||||
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||
|
||||
|
||||
|
|
|
@ -122,21 +122,35 @@ def word_shape(text):
|
|||
shape.append(shape_char)
|
||||
return ''.join(shape)
|
||||
|
||||
def lower(string): return string.lower()
|
||||
def prefix(string): return string[0]
|
||||
def suffix(string): return string[-3:]
|
||||
def cluster(string): return 0
|
||||
def is_alpha(string): return string.isalpha()
|
||||
def is_digit(string): return string.isdigit()
|
||||
def is_lower(string): return string.islower()
|
||||
def is_space(string): return string.isspace()
|
||||
def is_title(string): return string.istitle()
|
||||
def is_upper(string): return string.isupper()
|
||||
def is_stop(string, stops=set()): return string in stops
|
||||
def is_oov(string): return True
|
||||
def get_prob(string): return -20.
|
||||
|
||||
LEX_ATTRS = {
|
||||
attrs.LOWER: lambda string: string.lower(),
|
||||
attrs.NORM: lambda string: string.lower(),
|
||||
attrs.PREFIX: lambda string: string[0],
|
||||
attrs.SUFFIX: lambda string: string[-3:],
|
||||
attrs.CLUSTER: lambda string: 0,
|
||||
attrs.IS_ALPHA: lambda string: string.isalpha(),
|
||||
attrs.IS_DIGIT: lambda string: string.isdigit(),
|
||||
attrs.IS_LOWER: lambda string: string.islower(),
|
||||
attrs.IS_SPACE: lambda string: string.isspace(),
|
||||
attrs.IS_TITLE: lambda string: string.istitle(),
|
||||
attrs.IS_UPPER: lambda string: string.isupper(),
|
||||
attrs.IS_STOP: lambda string: False,
|
||||
attrs.IS_OOV: lambda string: True,
|
||||
attrs.LOWER: lower,
|
||||
attrs.NORM: lower,
|
||||
attrs.PREFIX: prefix,
|
||||
attrs.SUFFIX: suffix,
|
||||
attrs.CLUSTER: cluster,
|
||||
attrs.IS_ALPHA: is_alpha,
|
||||
attrs.IS_DIGIT: is_digit,
|
||||
attrs.IS_LOWER: is_lower,
|
||||
attrs.IS_SPACE: is_space,
|
||||
attrs.IS_TITLE: is_title,
|
||||
attrs.IS_UPPER: is_upper,
|
||||
attrs.IS_STOP: is_stop,
|
||||
attrs.IS_OOV: is_oov,
|
||||
attrs.PROB: get_prob,
|
||||
attrs.LIKE_EMAIL: like_email,
|
||||
attrs.LIKE_NUM: like_num,
|
||||
attrs.IS_PUNCT: is_punct,
|
||||
|
|
|
@ -16,9 +16,8 @@ class NorwegianDefaults(Language.Defaults):
|
|||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'nb'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Norwegian(Language):
|
||||
|
|
18
spacy/lang/nb/examples.py
Normal file
18
spacy/lang/nb/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.nb.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
|
||||
"Selvkjørende biler flytter forsikringsansvaret over på produsentene ",
|
||||
"San Francisco vurderer å forby robotbud på fortauene",
|
||||
"London er en stor by i Storbritannia."
|
||||
]
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
|
@ -12,11 +13,11 @@ from ...util import update_exc, add_lookups
|
|||
|
||||
class DutchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: 'nl'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Dutch(Language):
|
||||
|
|
40
spacy/lang/nl/lex_attrs.py
Normal file
40
spacy/lang/nl/lex_attrs.py
Normal file
|
@ -0,0 +1,40 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
_num_words = set("""
|
||||
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
|
||||
veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd
|
||||
duizend miljoen miljard biljoen biljard triljoen triljard
|
||||
""".split())
|
||||
|
||||
_ordinal_words = set("""
|
||||
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
|
||||
twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste
|
||||
zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste
|
||||
miljardste biljoenste biljardste triljoenste triljardste
|
||||
""".split())
|
||||
|
||||
|
||||
def like_num(text):
|
||||
# This only does the most basic check for whether a token is a digit
|
||||
# or matches one of the number words. In order to handle numbers like
|
||||
# "drieëntwintig", more work is required.
|
||||
# See this discussion: https://github.com/explosion/spaCy/pull/1177
|
||||
text = text.replace(',', '').replace('.', '')
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count('/') == 1:
|
||||
num, denom = text.split('/')
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
if text in _num_words:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
LEX_ATTRS = {
|
||||
LIKE_NUM: like_num
|
||||
}
|
|
@ -31,11 +31,21 @@ BASE_NORMS = {
|
|||
"„": '"',
|
||||
"»": '"',
|
||||
"«": '"',
|
||||
"‘‘": '"',
|
||||
"’’": '"',
|
||||
"?": "?",
|
||||
"!": "!",
|
||||
",": ",",
|
||||
";": ";",
|
||||
":": ":",
|
||||
"。": ".",
|
||||
"।": ".",
|
||||
"…": "...",
|
||||
"—": "-",
|
||||
"–": "-",
|
||||
"--": "-",
|
||||
"---": "-",
|
||||
"——": "-",
|
||||
"€": "$",
|
||||
"£": "$",
|
||||
"¥": "$",
|
||||
|
|
|
@ -15,9 +15,8 @@ class PolishDefaults(Language.Defaults):
|
|||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'pl'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Polish(Language):
|
||||
|
|
20
spacy/lang/pl/examples.py
Normal file
20
spacy/lang/pl/examples.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.pl.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Poczuł przyjemną woń mocnej kawy.",
|
||||
"Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
|
||||
"Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
|
||||
"Nowy abonament pod lupą Komisji Europejskiej",
|
||||
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
|
||||
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
|
||||
]
|
|
@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
@ -19,13 +18,9 @@ class PortugueseDefaults(Language.Defaults):
|
|||
lex_attr_getters[LANG] = lambda text: 'pt'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
stop_words = STOP_WORDS
|
||||
lemma_lookup = LOOKUP
|
||||
|
||||
|
||||
class Portuguese(Language):
|
||||
|
|
18
spacy/lang/pt/examples.py
Normal file
18
spacy/lang/pt/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.pt.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
|
||||
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes."
|
||||
"São Francisco considera banir os robôs de entrega que andam pelas calçadas",
|
||||
"Londres é a maior cidade do Reino Unido"
|
||||
]
|
|
@ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
@ -18,13 +17,10 @@ class SwedishDefaults(Language.Defaults):
|
|||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'sv'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
stop_words = STOP_WORDS
|
||||
lemma_rules = LEMMA_RULES
|
||||
lemma_lookup = LOOKUP
|
||||
|
||||
|
||||
class Swedish(Language):
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user