Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)

Commit c6cd81f192: Wrap try/except around model saving
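The commit title refers to guarding model serialization against I/O failures so a long training run does not crash at the very end. As a rough illustration only (the helper name and error handling below are hypothetical, not the actual spaCy code in this commit):

    from pathlib import Path

    def save_model(nlp, output_dir):
        # Hypothetical helper sketching the pattern named in the commit title:
        # attempt the save and report failures instead of raising.
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        try:
            nlp.to_disk(output_dir)  # v2-style serialization, as used elsewhere in this diff
        except Exception as err:
            print("Could not save model to", output_dir, "-", err)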
@@ -1 +1,55 @@
+environment:
+
+  matrix:
+
+    # For Python versions available on Appveyor, see
+    # http://www.appveyor.com/docs/installed-software#python
+    # The list here is complete (excluding Python 2.6, which
+    # isn't covered by this document) at the time of writing.
+
+    - PYTHON: "C:\\Python27"
+    #- PYTHON: "C:\\Python33"
+    #- PYTHON: "C:\\Python34"
+    #- PYTHON: "C:\\Python35"
+    #- PYTHON: "C:\\Python27-x64"
+    #- PYTHON: "C:\\Python33-x64"
+    #- DISTUTILS_USE_SDK: "1"
+    #- PYTHON: "C:\\Python34-x64"
+    #- DISTUTILS_USE_SDK: "1"
+    #- PYTHON: "C:\\Python35-x64"
+    - PYTHON: "C:\\Python36-x64"
+
+install:
+  # We need wheel installed to build wheels
+  - "%PYTHON%\\python.exe -m pip install wheel"
+  - "%PYTHON%\\python.exe -m pip install cython"
+  - "%PYTHON%\\python.exe -m pip install -r requirements.txt"
+  - "%PYTHON%\\python.exe -m pip install -e ."
+
+build: off
+
+test_script:
+  # Put your test command here.
+  # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4,
+  # you can remove "build.cmd" from the front of the command, as it's
+  # only needed to support those cases.
+  # Note that you must use the environment variable %PYTHON% to refer to
+  # the interpreter you're using - Appveyor does not do anything special
+  # to put the Python version you want to use on PATH.
+  - "%PYTHON%\\python.exe -m pytest spacy/"
+
+after_test:
+  # This step builds your wheels.
+  # Again, you only need build.cmd if you're building C extensions for
+  # 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct
+  # interpreter
+  - "%PYTHON%\\python.exe setup.py bdist_wheel"
+
+artifacts:
+  # bdist_wheel puts your built wheel in the dist directory
+  - path: dist\*
+
+#on_success:
+#  You can use this step to upload your artifacts to a public website.
+#  See Appveyor's documentation for more details. Or you can simply
+#  access your wheels from the Appveyor "artifacts" tab for your build.
.buildkite/sdist.yml (new file, 11 lines)
@@ -0,0 +1,11 @@
+steps:
+  -
+    command: "fab env clean make test sdist"
+    label: ":dizzy: :python:"
+    artifact_paths: "dist/*.tar.gz"
+  - wait
+  - trigger: "spacy-sdist-against-models"
+    label: ":dizzy: :hammer:"
+    build:
+      env:
+        SPACY_VERSION: "{$SPACY_VERSION}"
.gitignore (vendored, 4 changes)
@@ -1,14 +1,12 @@
 # spaCy
 spacy/data/
 corpora/
-models/
+/models/
 keys/

 # Website
 website/www/
 website/_deploy.sh
-website/package.json
-website/announcement.jade
 website/.gitignore

 # Cython / C extensions
@@ -1,322 +0,0 @@
-'''WIP --- Doesn't work well yet'''
-import plac
-import random
-import six
-
-import cProfile
-import pstats
-
-import pathlib
-import cPickle as pickle
-from itertools import izip
-
-import spacy
-
-import cytoolz
-import cupy as xp
-import cupy.cuda
-import chainer.cuda
-
-import chainer.links as L
-import chainer.functions as F
-from chainer import Chain, Variable, report
-import chainer.training
-import chainer.optimizers
-from chainer.training import extensions
-from chainer.iterators import SerialIterator
-from chainer.datasets import TupleDataset
-
-
-class SentimentAnalyser(object):
-    @classmethod
-    def load(cls, path, nlp, max_length=100):
-        raise NotImplementedError
-        #with (path / 'config.json').open() as file_:
-        #    model = model_from_json(file_.read())
-        #with (path / 'model').open('rb') as file_:
-        #    lstm_weights = pickle.load(file_)
-        #embeddings = get_embeddings(nlp.vocab)
-        #model.set_weights([embeddings] + lstm_weights)
-        #return cls(model, max_length=max_length)
-
-    def __init__(self, model, max_length=100):
-        self._model = model
-        self.max_length = max_length
-
-    def __call__(self, doc):
-        X = get_features([doc], self.max_length)
-        y = self._model.predict(X)
-        self.set_sentiment(doc, y)
-
-    def pipe(self, docs, batch_size=1000, n_threads=2):
-        for minibatch in cytoolz.partition_all(batch_size, docs):
-            minibatch = list(minibatch)
-            sentences = []
-            for doc in minibatch:
-                sentences.extend(doc.sents)
-            Xs = get_features(sentences, self.max_length)
-            ys = self._model.predict(Xs)
-            for sent, label in zip(sentences, ys):
-                sent.doc.sentiment += label - 0.5
-            for doc in minibatch:
-                yield doc
-
-    def set_sentiment(self, doc, y):
-        doc.sentiment = float(y[0])
-        # Sentiment has a native slot for a single float.
-        # For arbitrary data storage, there's:
-        # doc.user_data['my_data'] = y
-
-
-class Classifier(Chain):
-    def __init__(self, predictor):
-        super(Classifier, self).__init__(predictor=predictor)
-
-    def __call__(self, x, t):
-        y = self.predictor(x)
-        loss = F.softmax_cross_entropy(y, t)
-        accuracy = F.accuracy(y, t)
-        report({'loss': loss, 'accuracy': accuracy}, self)
-        return loss
-
-
-class SentimentModel(Chain):
-    def __init__(self, nlp, shape, **settings):
-        Chain.__init__(self,
-            embed=_Embed(shape['nr_vector'], shape['nr_dim'], shape['nr_hidden'],
-                set_vectors=lambda arr: set_vectors(arr, nlp.vocab)),
-            encode=_Encode(shape['nr_hidden'], shape['nr_hidden']),
-            attend=_Attend(shape['nr_hidden'], shape['nr_hidden']),
-            predict=_Predict(shape['nr_hidden'], shape['nr_class']))
-        self.to_gpu(0)
-
-    def __call__(self, sentence):
-        return self.predict(
-            self.attend(
-                self.encode(
-                    self.embed(sentence))))
-
-
-class _Embed(Chain):
-    def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None):
-        Chain.__init__(self,
-            embed=L.EmbedID(nr_vector, nr_dim, initialW=set_vectors),
-            project=L.Linear(None, nr_out, nobias=True))
-        self.embed.W.volatile = False
-
-    def __call__(self, sentence):
-        return [self.project(self.embed(ts)) for ts in F.transpose(sentence)]
-
-
-class _Encode(Chain):
-    def __init__(self, nr_in, nr_out):
-        Chain.__init__(self,
-            fwd=L.LSTM(nr_in, nr_out),
-            bwd=L.LSTM(nr_in, nr_out),
-            mix=L.Bilinear(nr_out, nr_out, nr_out))
-
-    def __call__(self, sentence):
-        self.fwd.reset_state()
-        fwds = map(self.fwd, sentence)
-        self.bwd.reset_state()
-        bwds = reversed(map(self.bwd, reversed(sentence)))
-        return [F.elu(self.mix(f, b)) for f, b in zip(fwds, bwds)]
-
-
-class _Attend(Chain):
-    def __init__(self, nr_in, nr_out):
-        Chain.__init__(self)
-
-    def __call__(self, sentence):
-        sent = sum(sentence)
-        return sent
-
-
-class _Predict(Chain):
-    def __init__(self, nr_in, nr_out):
-        Chain.__init__(self,
-            l1=L.Linear(nr_in, nr_in),
-            l2=L.Linear(nr_in, nr_out))
-
-    def __call__(self, vector):
-        vector = self.l1(vector)
-        vector = F.elu(vector)
-        vector = self.l2(vector)
-        return vector
-
-
-class SentenceDataset(TupleDataset):
-    def __init__(self, nlp, texts, labels, max_length):
-        self.max_length = max_length
-        sents, labels = self._get_labelled_sentences(
-            nlp.pipe(texts, batch_size=5000, n_threads=3),
-            labels)
-        TupleDataset.__init__(self,
-            get_features(sents, max_length),
-            labels)
-
-    def __getitem__(self, index):
-        batches = [dataset[index] for dataset in self._datasets]
-        if isinstance(index, slice):
-            length = len(batches[0])
-            returns = [tuple([batch[i] for batch in batches])
-                       for i in six.moves.range(length)]
-            return returns
-        else:
-            return tuple(batches)
-
-    def _get_labelled_sentences(self, docs, doc_labels):
-        labels = []
-        sentences = []
-        for doc, y in izip(docs, doc_labels):
-            for sent in doc.sents:
-                sentences.append(sent)
-                labels.append(y)
-        return sentences, xp.asarray(labels, dtype='i')
-
-
-class DocDataset(TupleDataset):
-    def __init__(self, nlp, texts, labels):
-        self.max_length = max_length
-        DatasetMixin.__init__(self,
-            get_features(
-                nlp.pipe(texts, batch_size=5000, n_threads=3), self.max_length),
-            labels)
-
-
-def read_data(data_dir, limit=0):
-    examples = []
-    for subdir, label in (('pos', 1), ('neg', 0)):
-        for filename in (data_dir / subdir).iterdir():
-            with filename.open() as file_:
-                text = file_.read()
-            examples.append((text, label))
-    random.shuffle(examples)
-    if limit >= 1:
-        examples = examples[:limit]
-    return zip(*examples) # Unzips into two lists
-
-
-def get_features(docs, max_length):
-    docs = list(docs)
-    Xs = xp.zeros((len(docs), max_length), dtype='i')
-    for i, doc in enumerate(docs):
-        j = 0
-        for token in doc:
-            if token.has_vector and not token.is_punct and not token.is_space:
-                Xs[i, j] = token.norm
-                j += 1
-                if j >= max_length:
-                    break
-    return Xs
-
-
-def set_vectors(vectors, vocab):
-    for lex in vocab:
-        if lex.has_vector and (lex.rank+1) < vectors.shape[0]:
-            lex.norm = lex.rank+1
-            vectors[lex.rank + 1] = lex.vector
-        else:
-            lex.norm = 0
-    return vectors
-
-
-def train(train_texts, train_labels, dev_texts, dev_labels,
-        lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
-        by_sentence=True):
-    nlp = spacy.load('en', entity=False)
-    if 'nr_vector' not in lstm_shape:
-        lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector)
-    if 'nr_dim' not in lstm_shape:
-        lstm_shape['nr_dim'] = nlp.vocab.vectors_length
-    print("Make model")
-    model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings))
-    print("Parsing texts...")
-    if by_sentence:
-        train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length'])
-        dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length'])
-    else:
-        train_data = DocDataset(nlp, train_texts, train_labels)
-        dev_data = DocDataset(nlp, dev_texts, dev_labels)
-    train_iter = SerialIterator(train_data, batch_size=batch_size,
-                                shuffle=True, repeat=True)
-    dev_iter = SerialIterator(dev_data, batch_size=batch_size,
-                              shuffle=False, repeat=False)
-    optimizer = chainer.optimizers.Adam()
-    optimizer.setup(model)
-    updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0)
-    trainer = chainer.training.Trainer(updater, (1, 'epoch'), out='result')
-
-    trainer.extend(extensions.Evaluator(dev_iter, model, device=0))
-    trainer.extend(extensions.LogReport())
-    trainer.extend(extensions.PrintReport([
-        'epoch', 'main/accuracy', 'validation/main/accuracy']))
-    trainer.extend(extensions.ProgressBar())
-
-    trainer.run()
-
-
-def evaluate(model_dir, texts, labels, max_length=100):
-    def create_pipeline(nlp):
-        '''
-        This could be a lambda, but named functions are easier to read in Python.
-        '''
-        return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp,
-                                                               max_length=max_length)]
-
-    nlp = spacy.load('en')
-    nlp.pipeline = create_pipeline(nlp)
-
-    correct = 0
-    i = 0
-    for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
-        correct += bool(doc.sentiment >= 0.5) == bool(labels[i])
-        i += 1
-    return float(correct) / i
-
-
-@plac.annotations(
-    train_dir=("Location of training file or directory"),
-    dev_dir=("Location of development file or directory"),
-    model_dir=("Location of output model directory",),
-    is_runtime=("Demonstrate run-time usage", "flag", "r", bool),
-    nr_hidden=("Number of hidden units", "option", "H", int),
-    max_length=("Maximum sentence length", "option", "L", int),
-    dropout=("Dropout", "option", "d", float),
-    learn_rate=("Learn rate", "option", "e", float),
-    nb_epoch=("Number of training epochs", "option", "i", int),
-    batch_size=("Size of minibatches for training LSTM", "option", "b", int),
-    nr_examples=("Limit to N examples", "option", "n", int)
-)
-def main(model_dir, train_dir, dev_dir,
-         is_runtime=False,
-         nr_hidden=64, max_length=100, # Shape
-         dropout=0.5, learn_rate=0.001, # General NN config
-         nb_epoch=5, batch_size=32, nr_examples=-1):  # Training params
-    model_dir = pathlib.Path(model_dir)
-    train_dir = pathlib.Path(train_dir)
-    dev_dir = pathlib.Path(dev_dir)
-    if is_runtime:
-        dev_texts, dev_labels = read_data(dev_dir)
-        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
-        print(acc)
-    else:
-        print("Read data")
-        train_texts, train_labels = read_data(train_dir, limit=nr_examples)
-        dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
-        print("Using GPU 0")
-        #chainer.cuda.get_device(0).use()
-        train_labels = xp.asarray(train_labels, dtype='i')
-        dev_labels = xp.asarray(dev_labels, dtype='i')
-        lstm = train(train_texts, train_labels, dev_texts, dev_labels,
-                     {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2,
-                      'nr_vector': 5000},
-                     {'dropout': 0.5, 'lr': learn_rate},
-                     {},
-                     nb_epoch=nb_epoch, batch_size=batch_size)
-
-
-if __name__ == '__main__':
-    #cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
-    #s = pstats.Stats("Profile.prof")
-    #s.strip_dirs().sort_stats("time").print_stats()
-    plac.call(main)
@@ -20,72 +20,72 @@ The algorithm is O(n) at run-time for document of length n because we're only ev
 matching over the tag patterns. So no matter how many phrases we're looking for,
 our pattern set stays very small (exact size depends on the maximum length we're
 looking for, as the query language currently has no quantifiers)

+The example expects a .bz2 file from the Reddit corpus, and a patterns file,
+formatted in jsonl as a sequence of entries like this:
+
+{"text":"Anchorage"}
+{"text":"Angola"}
+{"text":"Ann Arbor"}
+{"text":"Annapolis"}
+{"text":"Appalachia"}
+{"text":"Argentina"}
 """
 from __future__ import print_function, unicode_literals, division
-from ast import literal_eval
 from bz2 import BZ2File
 import time
 import math
 import codecs

 import plac
+import ujson

-from preshed.maps import PreshMap
-from preshed.counter import PreshCounter
-from spacy.strings import hash_string
-from spacy.en import English
 from spacy.matcher import PhraseMatcher
+import spacy


 def read_gazetteer(tokenizer, loc, n=-1):
     for i, line in enumerate(open(loc)):
-        phrase = literal_eval('u' + line.strip())
-        if ' (' in phrase and phrase.endswith(')'):
-            phrase = phrase.split(' (', 1)[0]
-        if i >= n:
-            break
-        phrase = tokenizer(phrase)
-        if all((t.is_lower and t.prob >= -10) for t in phrase):
-            continue
+        data = ujson.loads(line.strip())
+        phrase = tokenizer(data['text'])
+        for w in phrase:
+            _ = tokenizer.vocab[w.text]
         if len(phrase) >= 2:
             yield phrase


-def read_text(bz2_loc):
+def read_text(bz2_loc, n=10000):
     with BZ2File(bz2_loc) as file_:
-        for line in file_:
-            yield line.decode('utf8')
+        for i, line in enumerate(file_):
+            data = ujson.loads(line)
+            yield data['body']
+            if i >= n:
+                break


 def get_matches(tokenizer, phrases, texts, max_length=6):
-    matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length)
-    print("Match")
+    matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
+    matcher.add('Phrase', None, *phrases)
     for text in texts:
         doc = tokenizer(text)
+        for w in doc:
+            _ = doc.vocab[w.text]
         matches = matcher(doc)
-        for mwe in doc.ents:
-            yield mwe
+        for ent_id, start, end in matches:
+            yield (ent_id, doc[start:end].text)


-def main(patterns_loc, text_loc, counts_loc, n=10000000):
-    nlp = English(parser=False, tagger=False, entity=False)
-    print("Make matcher")
-    phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
-    counts = PreshCounter()
+def main(patterns_loc, text_loc, n=10000):
+    nlp = spacy.blank('en')
+    nlp.vocab.lex_attr_getters = {}
+    phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
+    count = 0
     t1 = time.time()
-    for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
-        counts.inc(hash_string(mwe.text), 1)
+    for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
+        count += 1
     t2 = time.time()
-    print("10m tokens in %d s" % (t2 - t1))
-
-    with codecs.open(counts_loc, 'w', 'utf8') as file_:
-        for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
-            text = phrase.string
-            key = hash_string(text)
-            count = counts[key]
-            if count != 0:
-                file_.write('%d\t%s\n' % (count, text))
+    print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))


 if __name__ == '__main__':
     if False:
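The hunk above moves the example to the spaCy 2 PhraseMatcher API, where patterns are added under a name and matches come back as (match_id, start, end) triples. A minimal, self-contained sketch of that usage (the sample phrases and sentence below are made up for illustration):

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank('en')
    # Patterns are pre-tokenized Doc objects, as in read_gazetteer() above.
    patterns = [nlp.tokenizer(text) for text in ('Ann Arbor', 'Anchorage', 'Argentina')]

    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('Phrase', None, *patterns)   # v2-style add(name, on_match, *docs)

    doc = nlp.tokenizer('She moved from Anchorage to Ann Arbor last year.')
    for match_id, start, end in matcher(doc):
        print(nlp.vocab.strings[match_id], doc[start:end].text)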
@@ -13,24 +13,29 @@ Input data:
 https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip

 Developed for: spaCy 1.7.1
-Last tested for: spaCy 1.7.1
+Last tested for: spaCy 2.0.0a13
 '''
 from __future__ import unicode_literals, print_function
 import plac
 from pathlib import Path
 import random
 import json
+import tqdm

+from thinc.neural.optimizers import Adam
+from thinc.neural.ops import NumpyOps

-import spacy.orth as orth_funcs
 from spacy.vocab import Vocab
-from spacy.pipeline import BeamEntityRecognizer
-from spacy.pipeline import EntityRecognizer
+from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
 from spacy.tokenizer import Tokenizer
 from spacy.tokens import Doc
 from spacy.attrs import *
 from spacy.gold import GoldParse
-from spacy.gold import _iob_to_biluo as iob_to_biluo
+from spacy.gold import iob_to_biluo
+from spacy.gold import minibatch
 from spacy.scorer import Scorer
+import spacy.util


 try:
     unicode
@@ -38,96 +43,38 @@ except NameError:
     unicode = str


+spacy.util.set_env_log(True)


 def init_vocab():
     return Vocab(
         lex_attr_getters={
             LOWER: lambda string: string.lower(),
-            SHAPE: orth_funcs.word_shape,
+            NORM: lambda string: string.lower(),
             PREFIX: lambda string: string[0],
             SUFFIX: lambda string: string[-3:],
-            CLUSTER: lambda string: 0,
-            IS_ALPHA: orth_funcs.is_alpha,
-            IS_ASCII: orth_funcs.is_ascii,
-            IS_DIGIT: lambda string: string.isdigit(),
-            IS_LOWER: orth_funcs.is_lower,
-            IS_PUNCT: orth_funcs.is_punct,
-            IS_SPACE: lambda string: string.isspace(),
-            IS_TITLE: orth_funcs.is_title,
-            IS_UPPER: orth_funcs.is_upper,
-            IS_STOP: lambda string: False,
-            IS_OOV: lambda string: True
         })


-def save_vocab(vocab, path):
-    path = Path(path)
-    if not path.exists():
-        path.mkdir()
-    elif not path.is_dir():
-        raise IOError("Can't save vocab to %s\nNot a directory" % path)
-    with (path / 'strings.json').open('w') as file_:
-        vocab.strings.dump(file_)
-    vocab.dump((path / 'lexemes.bin').as_posix())
-
-
-def load_vocab(path):
-    path = Path(path)
-    if not path.exists():
-        raise IOError("Cannot load vocab from %s\nDoes not exist" % path)
-    if not path.is_dir():
-        raise IOError("Cannot load vocab from %s\nNot a directory" % path)
-    return Vocab.load(path)
-
-
-def init_ner_model(vocab, features=None):
-    if features is None:
-        features = tuple(EntityRecognizer.feature_templates)
-    return EntityRecognizer(vocab, features=features)
-
-
-def save_ner_model(model, path):
-    path = Path(path)
-    if not path.exists():
-        path.mkdir()
-    if not path.is_dir():
-        raise IOError("Can't save model to %s\nNot a directory" % path)
-    model.model.dump((path / 'model').as_posix())
-    with (path / 'config.json').open('w') as file_:
-        data = json.dumps(model.cfg)
-        if not isinstance(data, unicode):
-            data = data.decode('utf8')
-        file_.write(data)
-
-
-def load_ner_model(vocab, path):
-    return EntityRecognizer.load(path, vocab)
-
-
 class Pipeline(object):
-    @classmethod
-    def load(cls, path):
-        path = Path(path)
-        if not path.exists():
-            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
-        if not path.is_dir():
-            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
-        vocab = load_vocab(path)
-        tokenizer = Tokenizer(vocab, {}, None, None, None)
-        ner_model = load_ner_model(vocab, path / 'ner')
-        return cls(vocab, tokenizer, ner_model)
-
     def __init__(self, vocab=None, tokenizer=None, entity=None):
         if vocab is None:
             vocab = init_vocab()
         if tokenizer is None:
             tokenizer = Tokenizer(vocab, {}, None, None, None)
         if entity is None:
-            entity = init_ner_model(self.vocab)
+            entity = NeuralEntityRecognizer(vocab)
         self.vocab = vocab
         self.tokenizer = tokenizer
         self.entity = entity
         self.pipeline = [self.entity]

+    def begin_training(self):
+        for model in self.pipeline:
+            model.begin_training([])
+        optimizer = Adam(NumpyOps(), 0.001)
+        return optimizer

     def __call__(self, input_):
         doc = self.make_doc(input_)
         for process in self.pipeline:
@@ -147,14 +94,16 @@ class Pipeline(object):
         gold = GoldParse(doc, entities=annotations)
         return gold

-    def update(self, input_, annot):
-        doc = self.make_doc(input_)
-        gold = self.make_gold(input_, annot)
-        for ner in gold.ner:
-            if ner not in (None, '-', 'O'):
-                action, label = ner.split('-', 1)
-                self.entity.add_label(label)
-        return self.entity.update(doc, gold)
+    def update(self, inputs, annots, sgd, losses=None, drop=0.):
+        if losses is None:
+            losses = {}
+        docs = [self.make_doc(input_) for input_ in inputs]
+        golds = [self.make_gold(input_, annot) for input_, annot in
+                 zip(inputs, annots)]
+        self.entity.update(docs, golds, drop=drop,
+                           sgd=sgd, losses=losses)
+        return losses

     def evaluate(self, examples):
         scorer = Scorer()
@@ -164,34 +113,36 @@ class Pipeline(object):
             scorer.score(doc, gold)
         return scorer.scores

-    def average_weights(self):
-        self.entity.model.end_training()
-
-    def save(self, path):
+    def to_disk(self, path):
         path = Path(path)
         if not path.exists():
             path.mkdir()
         elif not path.is_dir():
             raise IOError("Can't save pipeline to %s\nNot a directory" % path)
-        save_vocab(self.vocab, path / 'vocab')
-        save_ner_model(self.entity, path / 'ner')
+        self.vocab.to_disk(path / 'vocab')
+        self.entity.to_disk(path / 'ner')

+    def from_disk(self, path):
+        path = Path(path)
+        if not path.exists():
+            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
+        if not path.is_dir():
+            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
+        self.vocab = self.vocab.from_disk(path / 'vocab')
+        self.entity = self.entity.from_disk(path / 'ner')


-def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
-    next_epoch = train_examples
+def train(nlp, train_examples, dev_examples, nr_epoch=5):
+    sgd = nlp.begin_training()
     print("Iter", "Loss", "P", "R", "F")
     for i in range(nr_epoch):
-        this_epoch = next_epoch
-        next_epoch = []
-        loss = 0
-        for input_, annot in this_epoch:
-            loss += nlp.update(input_, annot)
-            if (i+1) < nr_epoch:
-                next_epoch.append((input_, annot))
-        random.shuffle(next_epoch)
+        random.shuffle(train_examples)
+        losses = {}
+        for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8):
+            inputs, annots = zip(*batch)
+            nlp.update(list(inputs), list(annots), sgd, losses=losses)
         scores = nlp.evaluate(dev_examples)
-        report_scores(i, loss, scores)
-    nlp.average_weights()
+        report_scores(i, losses['ner'], scores)
     scores = nlp.evaluate(dev_examples)
     report_scores(channels, i+1, loss, scores)

@@ -208,7 +159,8 @@ def read_examples(path):
     with path.open() as file_:
         sents = file_.read().strip().split('\n\n')
         for sent in sents:
-            if not sent.strip():
+            sent = sent.strip()
+            if not sent:
                 continue
             tokens = sent.split('\n')
             while tokens and tokens[0].startswith('#'):
@@ -217,28 +169,39 @@ def read_examples(path):
             iob = []
             for token in tokens:
                 if token.strip():
-                    pieces = token.split()
+                    pieces = token.split('\t')
                     words.append(pieces[1])
                     iob.append(pieces[2])
             yield words, iob_to_biluo(iob)


+def get_labels(examples):
+    labels = set()
+    for words, tags in examples:
+        for tag in tags:
+            if '-' in tag:
+                labels.add(tag.split('-')[1])
+    return sorted(labels)


 @plac.annotations(
     model_dir=("Path to save the model", "positional", None, Path),
     train_loc=("Path to your training data", "positional", None, Path),
     dev_loc=("Path to your development data", "positional", None, Path),
 )
-def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
-        train_loc=None, dev_loc=None, nr_epoch=30):
-    train_examples = read_examples(train_loc)
+def main(model_dir, train_loc, dev_loc, nr_epoch=30):
+    print(model_dir, train_loc, dev_loc)
+    train_examples = list(read_examples(train_loc))
     dev_examples = read_examples(dev_loc)
-    nlp = Pipeline.load(model_dir)
+    nlp = Pipeline()
+    for label in get_labels(train_examples):
+        nlp.entity.add_label(label)
+        print("Add label", label)

-    train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)
+    train(nlp, train_examples, list(dev_examples), nr_epoch)

-    nlp.save(model_dir)
+    nlp.to_disk(model_dir)


 if __name__ == '__main__':
-    main()
+    plac.call(main)
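The training loop above replaces the old per-example updates with shuffled minibatches and a shared losses dict. Reduced to its core (a sketch that assumes the custom Pipeline class defined in this example, not the stock Language object), the pattern is:

    import random
    import tqdm
    from spacy.gold import minibatch  # available in the 2.0 alphas targeted here

    def train_epochs(nlp, train_examples, nr_epoch=5, batch_size=8):
        # Sketch of the batched update loop used above: shuffle, batch, update,
        # then read the accumulated loss out of the losses dict.
        sgd = nlp.begin_training()
        for i in range(nr_epoch):
            random.shuffle(train_examples)
            losses = {}
            for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=batch_size):
                inputs, annots = zip(*batch)
                nlp.update(list(inputs), list(annots), sgd, losses=losses)
            print(i, losses.get('ner', 0.0))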
@@ -25,7 +25,7 @@ For more details, see the documentation:
 * Saving and loading models: https://spacy.io/docs/usage/saving-loading

 Developed for: spaCy 1.7.6
-Last tested for: spaCy 1.7.6
+Last updated for: spaCy 2.0.0a13
 """
 from __future__ import unicode_literals, print_function

@@ -34,55 +34,41 @@ from pathlib import Path
 import random

 import spacy
-from spacy.gold import GoldParse
-from spacy.tagger import Tagger
+from spacy.gold import GoldParse, minibatch
+from spacy.pipeline import NeuralEntityRecognizer
+from spacy.pipeline import TokenVectorEncoder
+
+
+def get_gold_parses(tokenizer, train_data):
+    '''Shuffle and create GoldParse objects'''
+    random.shuffle(train_data)
+    for raw_text, entity_offsets in train_data:
+        doc = tokenizer(raw_text)
+        gold = GoldParse(doc, entities=entity_offsets)
+        yield doc, gold


 def train_ner(nlp, train_data, output_dir):
-    # Add new words to vocab
-    for raw_text, _ in train_data:
-        doc = nlp.make_doc(raw_text)
-        for word in doc:
-            _ = nlp.vocab[word.orth]
     random.seed(0)
-    # You may need to change the learning rate. It's generally difficult to
-    # guess what rate you should set, especially when you have limited data.
-    nlp.entity.model.learn_rate = 0.001
-    for itn in range(1000):
-        random.shuffle(train_data)
-        loss = 0.
-        for raw_text, entity_offsets in train_data:
-            gold = GoldParse(doc, entities=entity_offsets)
-            # By default, the GoldParse class assumes that the entities
-            # described by offset are complete, and all other words should
-            # have the tag 'O'. You can tell it to make no assumptions
-            # about the tag of a word by giving it the tag '-'.
-            # However, this allows a trivial solution to the current
-            # learning problem: if words are either 'any tag' or 'ANIMAL',
-            # the model can learn that all words can be tagged 'ANIMAL'.
-            #for i in range(len(gold.ner)):
-            #    if not gold.ner[i].endswith('ANIMAL'):
-            #        gold.ner[i] = '-'
-            doc = nlp.make_doc(raw_text)
-            nlp.tagger(doc)
-            # As of 1.9, spaCy's parser now lets you supply a dropout probability
-            # This might help the model generalize better from only a few
-            # examples.
-            loss += nlp.entity.update(doc, gold, drop=0.9)
-        if loss == 0:
-            break
-    # This step averages the model's weights. This may or may not be good for
-    # your situation --- it's empirical.
-    nlp.end_training()
-    if output_dir:
-        if not output_dir.exists():
-            output_dir.mkdir()
-        nlp.save_to_directory(output_dir)
+    optimizer = nlp.begin_training(lambda: [])
+    nlp.meta['name'] = 'en_ent_animal'
+    for itn in range(50):
+        losses = {}
+        for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3):
+            docs, golds = zip(*batch)
+            nlp.update(docs, golds, losses=losses, sgd=optimizer, update_shared=True,
+                       drop=0.35)
+        print(losses)
+    if not output_dir:
+        return
+    elif not output_dir.exists():
+        output_dir.mkdir()
+    nlp.to_disk(output_dir)


 def main(model_name, output_directory=None):
-    print("Loading initial model", model_name)
-    nlp = spacy.load(model_name)
+    print("Creating initial model", model_name)
+    nlp = spacy.blank(model_name)
     if output_directory is not None:
         output_directory = Path(output_directory)

@@ -91,6 +77,11 @@ def main(model_name, output_directory=None):
             "Horses are too tall and they pretend to care about your feelings",
             [(0, 6, 'ANIMAL')],
         ),
+        (
+            "Do they bite?",
+            [],
+        ),
+
         (
             "horses are too tall and they pretend to care about your feelings",
             [(0, 6, 'ANIMAL')]
@@ -109,18 +100,20 @@ def main(model_name, output_directory=None):
         )

     ]
-    nlp.entity.add_label('ANIMAL')
+    nlp.pipeline.append(TokenVectorEncoder(nlp.vocab))
+    nlp.pipeline.append(NeuralEntityRecognizer(nlp.vocab))
+    nlp.pipeline[-1].add_label('ANIMAL')
     train_ner(nlp, train_data, output_directory)

     # Test that the entity is recognized
-    doc = nlp('Do you like horses?')
+    text = 'Do you like horses?'
     print("Ents in 'Do you like horses?':")
+    doc = nlp(text)
     for ent in doc.ents:
         print(ent.label_, ent.text)
     if output_directory:
         print("Loading from", output_directory)
-        nlp2 = spacy.load('en', path=output_directory)
-        nlp2.entity.add_label('ANIMAL')
+        nlp2 = spacy.load(output_directory)
         doc2 = nlp2('Do you like horses?')
         for ent in doc2.ents:
             print(ent.label_, ent.text)
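In the updated example the pipeline is assembled by hand on a blank model before training; later 2.x releases expose the same idea through nlp.create_pipe() and nlp.add_pipe(). A condensed sketch of the alpha-era setup shown above, with a single made-up training example:

    import spacy
    from spacy.gold import GoldParse
    from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer

    nlp = spacy.blank('en')
    nlp.pipeline.append(TokenVectorEncoder(nlp.vocab))
    nlp.pipeline.append(NeuralEntityRecognizer(nlp.vocab))
    nlp.pipeline[-1].add_label('ANIMAL')

    # Training data is (text, [(start_char, end_char, label), ...]) pairs,
    # which GoldParse turns into per-token entity annotations.
    doc = nlp.make_doc("Horses are too tall")
    gold = GoldParse(doc, entities=[(0, 6, 'ANIMAL')])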
@@ -1,3 +1,7 @@
+'''Train a multi-label convolutional neural network text classifier,
+using the spacy.pipeline.TextCategorizer component. The model is then added
+to spacy.pipeline, and predictions are available at `doc.cats`.
+'''
 from __future__ import unicode_literals
 import plac
 import random
@@ -12,6 +16,11 @@ from spacy.gold import GoldParse, minibatch
 from spacy.util import compounding
 from spacy.pipeline import TextCategorizer

+# TODO: Remove this once we're not supporting models trained with thinc <6.9.0
+import thinc.neural._classes.layernorm
+thinc.neural._classes.layernorm.set_compat_six_eight(False)
+

 def train_textcat(tokenizer, textcat,
                   train_texts, train_cats, dev_texts, dev_cats,
@@ -24,14 +33,15 @@ def train_textcat(tokenizer, textcat,
     train_docs = [tokenizer(text) for text in train_texts]
     train_gold = [GoldParse(doc, cats=cats) for doc, cats in
                   zip(train_docs, train_cats)]
-    train_data = zip(train_docs, train_gold)
+    train_data = list(zip(train_docs, train_gold))
     batch_sizes = compounding(4., 128., 1.001)
     for i in range(n_iter):
         losses = {}
-        train_data = tqdm.tqdm(train_data, leave=False) # Progress bar
-        for batch in minibatch(train_data, size=batch_sizes):
+        # Progress bar and minibatching
+        batches = minibatch(tqdm.tqdm(train_data, leave=False), size=batch_sizes)
+        for batch in batches:
             docs, golds = zip(*batch)
-            textcat.update((docs, None), golds, sgd=optimizer, drop=0.2,
+            textcat.update(docs, golds, sgd=optimizer, drop=0.2,
                            losses=losses)
         with textcat.model.use_params(optimizer.averages):
             scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
@@ -61,12 +71,13 @@ def evaluate(tokenizer, textcat, texts, cats):
     return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}


-def load_data():
+def load_data(limit=0):
     # Partition off part of the train data --- avoid running experiments
     # against test.
     train_data, _ = thinc.extra.datasets.imdb()

     random.shuffle(train_data)
+    train_data = train_data[-limit:]

     texts, labels = zip(*train_data)
     cats = [(['POSITIVE'] if y else []) for y in labels]
@@ -86,7 +97,7 @@ def main(model_loc=None):
     textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])

     print("Load IMDB data")
-    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
+    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=1000)

     print("Itn.\tLoss\tP\tR\tF")
     progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
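As the new docstring says, predictions end up on doc.cats once the categorizer has processed a document. A rough usage sketch, assuming the trained textcat and tokenizer objects from this example (the review text is invented, and exact call signatures shifted between the 2.0 alphas):

    # After train_textcat() has run, score a new text with the component.
    doc = tokenizer("This movie was surprisingly good")
    textcat(doc)            # the component sets doc.cats in place
    print(doc.cats)         # e.g. {'POSITIVE': 0.87}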
examples/vectors_fast_text.py (new file, 30 lines)
@@ -0,0 +1,30 @@
+'''Load vectors for a language trained using FastText
+
+https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
+'''
+from __future__ import unicode_literals
+import plac
+import numpy
+
+import spacy.language
+
+
+def main(vectors_loc):
+    nlp = spacy.language.Language()
+
+    with open(vectors_loc, 'rb') as file_:
+        header = file_.readline()
+        nr_row, nr_dim = header.split()
+        nlp.vocab.clear_vectors(int(nr_dim))
+        for line in file_:
+            line = line.decode('utf8')
+            pieces = line.split()
+            word = pieces[0]
+            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
+            nlp.vocab.set_vector(word, vector)
+    doc = nlp(u'class colspan')
+    print(doc[0].similarity(doc[1]))
+
+
+if __name__ == '__main__':
+    plac.call(main)
fabfile.py (vendored, 5 changes)
@@ -14,6 +14,7 @@ VENV_DIR = path.join(PWD, ENV)
 def env(lang='python2.7'):
     if path.exists(VENV_DIR):
         local('rm -rf {env}'.format(env=VENV_DIR))
+    local('pip install virtualenv')
     local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))


@@ -32,6 +33,10 @@ def make():
         local('pip install -r requirements.txt')
         local('python setup.py build_ext --inplace')

+def sdist():
+    with virtualenv(VENV_DIR):
+        with lcd(path.dirname(__file__)):
+            local('python setup.py sdist')

 def clean():
     with lcd(path.dirname(__file__)):
@@ -1,9 +1,9 @@
-cython<0.24
+cython>=0.24,<0.27.0
 pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.8.0,<6.9.0
+thinc>=6.9.0,<6.10.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six
@@ -13,7 +13,7 @@ requests>=2.13.0,<3.0.0
 regex==2017.4.5
 ftfy>=4.4.2,<5.0.0
 pytest>=3.0.6,<4.0.0
-pip>=9.0.0,<10.0.0
 mock>=2.0.0,<3.0.0
 msgpack-python
 msgpack-numpy
+html5lib==1.0b8
setup.py (3 changes)
@@ -195,9 +195,8 @@ def setup_package():
             'murmurhash>=0.28,<0.29',
             'cymem>=1.30,<1.32',
             'preshed>=1.0.0,<2.0.0',
-            'thinc>=6.8.0,<6.9.0',
+            'thinc>=6.9.0,<6.10.0',
             'plac<1.0.0,>=0.9.6',
-            'pip>=9.0.0,<10.0.0',
             'six',
             'pathlib',
             'ujson>=1.35',
@@ -4,11 +4,13 @@ from __future__ import unicode_literals
 from .cli.info import info as cli_info
 from .glossary import explain
 from .deprecated import resolve_load_name
+#from .about import __version__
 from .about import __version__
 from . import util


 def load(name, **overrides):
+    from .deprecated import resolve_load_name
     name = resolve_load_name(name, **overrides)
     return util.load_model(name, **overrides)
@@ -7,7 +7,7 @@ if __name__ == '__main__':
     import plac
     import sys
     from spacy.cli import download, link, info, package, train, convert, model
-    from spacy.cli import profile
+    from spacy.cli import profile, evaluate
     from spacy.util import prints

     commands = {
@@ -15,6 +15,7 @@ if __name__ == '__main__':
         'link': link,
         'info': info,
         'train': train,
+        'evaluate': evaluate,
         'convert': convert,
         'package': package,
         'model': model,
spacy/_ml.py (161 changes)
@@ -1,28 +1,27 @@
 import ujson
+from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
+from thinc.i2v import HashEmbed, StaticVectors
+from thinc.t2t import ExtractWindow, ParametricAttention
+from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
+from thinc.misc import Residual
+from thinc.misc import BatchNorm as BN
+from thinc.misc import LayerNorm as LN
+
 from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
-from thinc.neural import Model, Maxout, Softmax, Affine
-from thinc.neural._classes.hash_embed import HashEmbed
+from thinc.api import FeatureExtracter, with_getitem
+from thinc.api import uniqued, wrap, flatten_add_lengths, noop
+
+from thinc.linear.linear import LinearModel
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module

 import random
 import cytoolz

-from thinc.neural._classes.convolution import ExtractWindow
-from thinc.neural._classes.static_vectors import StaticVectors
-from thinc.neural._classes.batchnorm import BatchNorm as BN
-from thinc.neural._classes.layernorm import LayerNorm as LN
-from thinc.neural._classes.resnet import Residual
-from thinc.neural import ReLu
-from thinc.neural._classes.selu import SELU
 from thinc import describe
 from thinc.describe import Dimension, Synapses, Biases, Gradient
 from thinc.neural._classes.affine import _set_dimensions_if_needed
-from thinc.api import FeatureExtracter, with_getitem
-from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
-from thinc.neural._classes.attention import ParametricAttention
-from thinc.linear.linear import LinearModel
-from thinc.api import uniqued, wrap, flatten_add_lengths
+import thinc.extra.load_nlp

 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
 from .tokens.doc import Doc
@@ -31,6 +30,11 @@ from . import util
 import numpy
 import io

+# TODO: Unset this once we don't want to support models previous models.
+import thinc.neural._classes.layernorm
+thinc.neural._classes.layernorm.set_compat_six_eight(True)
+
+VECTORS_KEY = 'spacy_pretrained_vectors'

 @layerize
 def _flatten_add_lengths(seqs, pad=0, drop=0.):
@@ -225,33 +229,80 @@ def drop_layer(layer, factor=2.):
     model.predict = layer
     return model

+def link_vectors_to_models(vocab):
+    vectors = vocab.vectors
+    ops = Model.ops
+    for word in vocab:
+        if word.orth in vectors.key2row:
+            word.rank = vectors.key2row[word.orth]
+        else:
+            word.rank = 0
+    data = ops.asarray(vectors.data)
+    # Set an entry here, so that vectors are accessed by StaticVectors
+    # (unideal, I know)
+    thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
+
+
-def Tok2Vec(width, embed_size, preprocess=None):
+def Tok2Vec(width, embed_size, **kwargs):
+    pretrained_dims = kwargs.get('pretrained_dims', 0)
+    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
-    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
+    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
+                                 '*': reapply}):
         norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
         prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
         suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
-        embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
-        tok2vec = (
-            with_flatten(
-                asarray(Model.ops, dtype='uint64')
-                >> uniqued(embed, column=5)
-                >> Residual(
-                    (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
-                ) ** 4, pad=4
-            )
-        )
-        if preprocess not in (False, None):
-            tok2vec = preprocess >> tok2vec
+        if pretrained_dims is not None and pretrained_dims >= 1:
+            glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
+
+            embed = uniqued(
+                (glove | norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*5, pieces=3)), column=5)
+        else:
+            embed = uniqued(
+                (norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*4, pieces=3)), column=5)
+
+        convolution = Residual(
+            ExtractWindow(nW=1)
+            >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
+        )
+
+        tok2vec = (
+            FeatureExtracter(cols)
+            >> with_flatten(
+                embed >> (convolution ** 4), pad=4)
+        )
+
         # Work around thinc API limitations :(. TODO: Revise in Thinc 7
         tok2vec.nO = width
         tok2vec.embed = embed
     return tok2vec


+def reapply(layer, n_times):
+    def reapply_fwd(X, drop=0.):
+        backprops = []
+        for i in range(n_times):
+            Y, backprop = layer.begin_update(X, drop=drop)
+            X = Y
+            backprops.append(backprop)
+        def reapply_bwd(dY, sgd=None):
+            dX = None
+            for backprop in reversed(backprops):
+                dY = backprop(dY, sgd=sgd)
+                if dX is None:
+                    dX = dY
+                else:
+                    dX += dY
+            return dX
+        return Y, reapply_bwd
+    return wrap(reapply_fwd, layer)
+
+
 def asarray(ops, dtype):
     def forward(X, drop=0.):
         return ops.asarray(X, dtype=dtype), None
@@ -455,20 +506,25 @@ def getitem(i):
         return X[i], None
     return layerize(getitem_fwd)

-def build_tagger_model(nr_class, token_vector_width, **cfg):
-    embed_size = util.env_opt('embed_size', 7500)
+def build_tagger_model(nr_class, **cfg):
+    embed_size = util.env_opt('embed_size', 7000)
+    if 'token_vector_width' in cfg:
+        token_vector_width = cfg['token_vector_width']
+    else:
+        token_vector_width = util.env_opt('token_vector_width', 128)
+    pretrained_dims = cfg.get('pretrained_dims', 0)
     with Model.define_operators({'>>': chain, '+': add}):
-        # Input: (doc, tensor) tuples
-        private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())
+        if 'tok2vec' in cfg:
+            tok2vec = cfg['tok2vec']
+        else:
+            tok2vec = Tok2Vec(token_vector_width, embed_size,
+                              pretrained_dims=pretrained_dims)
         model = (
-            fine_tune(private_tok2vec)
-            >> with_flatten(
-                Maxout(token_vector_width, token_vector_width)
-                >> Softmax(nr_class, token_vector_width)
-            )
+            tok2vec
+            >> with_flatten(Softmax(nr_class, token_vector_width))
         )
     model.nI = None
+    model.tok2vec = tok2vec
     return model

@@ -514,6 +570,7 @@ def foreach(layer, drop_factor=1.0):

 def build_text_classifier(nr_class, width=64, **cfg):
     nr_vector = cfg.get('nr_vector', 5000)
+    pretrained_dims = cfg.get('pretrained_dims', 0)
     with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
                                  '**': clone}):
         if cfg.get('low_data'):
@@ -521,7 +578,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
                 SpacyVectors
                 >> flatten_add_lengths
                 >> with_getitem(0,
-                    Affine(width, 300)
+                    Affine(width, pretrained_dims)
                 )
                 >> ParametricAttention(width)
                 >> Pooling(sum_pool)
@@ -548,18 +605,24 @@ def build_text_classifier(nr_class, width=64, **cfg):
             )
         )

-        static_vectors = (
-            SpacyVectors
-            >> with_flatten(Affine(width, 300))
-        )
-
-        cnn_model = (
-            # TODO Make concatenate support lists
-            concatenate_lists(trained_vectors, static_vectors)
+        if pretrained_dims:
+            static_vectors = (
+                SpacyVectors
+                >> with_flatten(Affine(width, pretrained_dims))
+            )
+            # TODO Make concatenate support lists
+            vectors = concatenate_lists(trained_vectors, static_vectors)
+            vectors_width = width*2
+        else:
+            vectors = trained_vectors
+            vectors_width = width
+            static_vectors = None
+        cnn_model = (
+            vectors
             >> with_flatten(
-                LN(Maxout(width, width*2))
+                LN(Maxout(width, vectors_width))
                 >> Residual(
-                    (ExtractWindow(nW=1) >> zero_init(Maxout(width, width*3)))
+                    (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
                 ) ** 2, pad=2
             )
             >> flatten_add_lengths
@@ -579,7 +642,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
             >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
|
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
|
||||||
>> logistic
|
>> logistic
|
||||||
)
|
)
|
||||||
|
model.nO = nr_class
|
||||||
model.lsuv = False
|
model.lsuv = False
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
|
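The new '*' operator registered above is bound to the reapply helper added in this change, so with these operators in scope, layer * n builds a model that feeds the layer's output back into itself n times and sums the gradients on the way back. A plain-Python sketch of the forward behaviour only, with illustrative names that are not part of the diff:

# Illustration of what reapply(layer, n_times) computes on the forward pass:
# the wrapped layer is applied repeatedly to its own output.
def apply_n_times(layer_fn, x, n_times):
    for _ in range(n_times):
        x = layer_fn(x)
    return x

assert apply_n_times(lambda v: v + 1, 0, 4) == 4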
@@ -3,14 +3,15 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy-nightly'
-__version__ = '2.0.0a13'
+__version__ = '2.0.0a16'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
 __email__ = 'contact@explosion.ai'
 __license__ = 'MIT'
+__release__ = True

-__docs_models__ = 'https://spacy.io/docs/usage/models'
+__docs_models__ = 'https://alpha.spacy.io/usage/models'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
 __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
 __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'
@@ -1,5 +1,5 @@
 # Reserve 64 values for flag features
-cpdef enum attr_id_t:
+cdef enum attr_id_t:
     NULL_ATTR
     IS_ALPHA
     IS_ASCII

@@ -94,6 +94,7 @@ IDS = {

 # ATTR IDs, in order of the symbol
 NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
+locals().update(IDS)


 def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
@@ -4,5 +4,6 @@ from .link import link
 from .package import package
 from .profile import profile
 from .train import train
+from .evaluate import evaluate
 from .convert import convert
 from .model import model

@@ -14,7 +14,7 @@ from ..util import prints
 CONVERTERS = {
     '.conllu': conllu2json,
     '.conll': conllu2json,
-    '.iob': iob2json
+    '.iob': iob2json,
 }
@@ -1,5 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
+from cytoolz import partition_all, concat

 from ...compat import json_dumps, path2str
 from ...util import prints

@@ -10,11 +11,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
     """
     Convert IOB files into JSON format for use with train cli.
     """
-    # TODO: This isn't complete yet -- need to map from IOB to
-    # BILUO
     with input_path.open('r', encoding='utf8') as file_:
-        docs = read_iob(file_)
+        sentences = read_iob(file_)
+    docs = merge_sentences(sentences, n_sents)
     output_filename = input_path.parts[-1].replace(".iob", ".json")
     output_file = output_path / output_filename
     with output_file.open('w', encoding='utf-8') as f:

@@ -23,9 +22,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
            title="Generated output file %s" % path2str(output_file))


-def read_iob(file_):
+def read_iob(raw_sents):
     sentences = []
-    for line in file_:
+    for line in raw_sents:
         if not line.strip():
             continue
         tokens = [t.split('|') for t in line.split()]

@@ -43,3 +42,15 @@ def read_iob(file_):
     paragraphs = [{'sentences': [sent]} for sent in sentences]
     docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
     return docs

+
+def merge_sentences(docs, n_sents):
+    counter = 0
+    merged = []
+    for group in partition_all(n_sents, docs):
+        group = list(group)
+        first = group.pop(0)
+        to_extend = first['paragraphs'][0]['sentences']
+        for sent in group[1:]:
+            to_extend.extend(sent['paragraphs'][0]['sentences'])
+        merged.append(first)
+    return merged
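For reference, merge_sentences groups every n_sents single-sentence documents produced by read_iob into one document. Note that group.pop(0) already removes the first sentence, so iterating group[1:] afterwards appears to skip the sentence that is now first in the remainder; whether that is intended is not clear from the diff. A small sketch of the grouping with that loop written over the full remainder (toy data, not from the diff):

from cytoolz import partition_all

# Hypothetical single-sentence docs in the intermediate format used above.
docs = [{'id': 0, 'paragraphs': [{'sentences': [s]}]} for s in ('s1', 's2', 's3', 's4')]

n_sents = 2
for group in partition_all(n_sents, docs):
    group = list(group)
    first = group.pop(0)
    for other in group:  # the committed code iterates group[1:] here instead
        first['paragraphs'][0]['sentences'].extend(other['paragraphs'][0]['sentences'])
    print(len(first['paragraphs'][0]['sentences']))  # prints 2, then 2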
spacy/cli/evaluate.py (new file, 119 lines)
@@ -0,0 +1,119 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function

import plac
import json
from collections import defaultdict
import cytoolz
from pathlib import Path
import dill
import tqdm
from thinc.neural._classes.model import Model
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer
import random
import numpy.random

from ..tokens.doc import Doc
from ..scorer import Scorer
from ..gold import GoldParse, merge_sents
from ..gold import GoldCorpus, minibatch
from ..util import prints
from .. import util
from .. import about
from .. import displacy
from ..compat import json_dumps

random.seed(0)
numpy.random.seed(0)


@plac.annotations(
    model=("Model name or path", "positional", None, str),
    data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
    gpu_id=("Use GPU", "option", "g", int),
    displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
    displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)
)
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
             displacy_path=None, displacy_limit=25):
    """
    Evaluate a model. To render a sample of parses in a HTML file, set an output
    directory as the displacy_path argument.
    """
    util.use_gpu(gpu_id)
    util.set_env_log(False)
    data_path = util.ensure_path(data_path)
    displacy_path = util.ensure_path(displacy_path)
    if not data_path.exists():
        prints(data_path, title="Evaluation data not found", exits=1)
    if displacy_path and not displacy_path.exists():
        prints(displacy_path, title="Visualization output directory not found", exits=1)
    corpus = GoldCorpus(data_path, data_path)
    nlp = util.load_model(model)
    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
    begin = timer()
    scorer = nlp.evaluate(dev_docs, verbose=False)
    end = timer()
    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
    print_results(scorer, time=end - begin, words=nwords,
                  wps=nwords / (end - begin))
    if displacy_path:
        docs, golds = zip(*dev_docs)
        render_deps = 'parser' in nlp.meta.get('pipeline', [])
        render_ents = 'ner' in nlp.meta.get('pipeline', [])
        render_parses(docs, displacy_path, model_name=model, limit=displacy_limit,
                      deps=render_deps, ents=render_ents)
        prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit)


def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True):
    docs[0].user_data['title'] = model_name
    if ents:
        with (output_path / 'entities.html').open('w') as file_:
            html = displacy.render(docs[:limit], style='ent', page=True)
            file_.write(html)
    if deps:
        with (output_path / 'parses.html').open('w') as file_:
            html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True})
            file_.write(html)


def print_progress(itn, losses, dev_scores, wps=0.0):
    scores = {}
    for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
                'ents_p', 'ents_r', 'ents_f', 'wps']:
        scores[col] = 0.0
    scores['dep_loss'] = losses.get('parser', 0.0)
    scores['ner_loss'] = losses.get('ner', 0.0)
    scores['tag_loss'] = losses.get('tagger', 0.0)
    scores.update(dev_scores)
    scores['wps'] = wps
    tpl = '\t'.join((
        '{:d}',
        '{dep_loss:.3f}',
        '{ner_loss:.3f}',
        '{uas:.3f}',
        '{ents_p:.3f}',
        '{ents_r:.3f}',
        '{ents_f:.3f}',
        '{tags_acc:.3f}',
        '{token_acc:.3f}',
        '{wps:.1f}'))
    print(tpl.format(itn, **scores))


def print_results(scorer, time, words, wps):
    results = {
        'Time': '%.2f s' % time,
        'Words': words,
        'Words/s': '%.0f' % wps,
        'TOK': '%.2f' % scorer.token_acc,
        'POS': '%.2f' % scorer.tags_acc,
        'UAS': '%.2f' % scorer.uas,
        'LAS': '%.2f' % scorer.las,
        'NER P': '%.2f' % scorer.ents_p,
        'NER R': '%.2f' % scorer.ents_r,
        'NER F': '%.2f' % scorer.ents_f}
    util.print_table(results, title="Results")
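Since evaluate is now exported from spacy.cli above, it can presumably be called like the other plac-wrapped commands. A sketch of a direct call; the command-name argument, model name and data path are placeholders, not values from the diff:

# Hypothetical direct call of the new evaluate command (plac normally supplies
# the first, command-name argument; names and paths here are placeholders).
from spacy.cli import evaluate

evaluate('evaluate', 'en_core_web_sm', 'dev_data.json',
         gpu_id=-1, gold_preproc=False, displacy_path=None, displacy_limit=25)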
@@ -105,8 +105,11 @@ def generate_pipeline():
         "parser, ner. For more information, see the docs on processing pipelines.",
         title="Enter your model's pipeline components")
     pipeline = util.get_raw_input("Pipeline components", True)
-    replace = {'True': True, 'False': False}
-    return replace[pipeline] if pipeline in replace else pipeline.split(', ')
+    subs = {'True': True, 'False': False}
+    if pipeline in subs:
+        return subs[pipeline]
+    else:
+        return [p.strip() for p in pipeline.split(',')]


 def validate_meta(meta, keys):
@@ -8,8 +8,11 @@ import cytoolz
 from pathlib import Path
 import dill
 import tqdm
+from thinc.neural._classes.model import Model
 from thinc.neural.optimizers import linear_decay
 from timeit import default_timer as timer
+import random
+import numpy.random

 from ..tokens.doc import Doc
 from ..scorer import Scorer

@@ -17,9 +20,13 @@ from ..gold import GoldParse, merge_sents
 from ..gold import GoldCorpus, minibatch
 from ..util import prints
 from .. import util
+from .. import about
 from .. import displacy
 from ..compat import json_dumps

+random.seed(0)
+numpy.random.seed(0)


 @plac.annotations(
     lang=("model language", "positional", None, str),

@@ -29,15 +36,17 @@ from ..compat import json_dumps
     n_iter=("number of iterations", "option", "n", int),
     n_sents=("number of sentences", "option", "ns", int),
     use_gpu=("Use GPU", "option", "g", int),
-    resume=("Whether to resume training", "flag", "R", bool),
+    vectors=("Model to load vectors from", "option", "v"),
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
+    version=("Model version", "option", "V", str),
+    meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
 )
-def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
-          use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
-          gold_preproc=False):
+def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
+          use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
+          gold_preproc=False, version="0.0.0", meta_path=None):
     """
     Train a model. Expects data in spaCy's JSON format.
     """

@@ -46,19 +55,24 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
     dev_path = util.ensure_path(dev_data)
+    meta_path = util.ensure_path(meta_path)
     if not output_path.exists():
         output_path.mkdir()
     if not train_path.exists():
         prints(train_path, title="Training data not found", exits=1)
     if dev_path and not dev_path.exists():
         prints(dev_path, title="Development data not found", exits=1)
+    if meta_path is not None and not meta_path.exists():
+        prints(meta_path, title="meta.json not found", exits=1)
+    meta = util.read_json(meta_path) if meta_path else {}
+    if not isinstance(meta, dict):
+        prints("Expected dict but got: {}".format(type(meta)),
+               title="Not a valid meta.json format", exits=1)

-    lang_class = util.get_lang_class(lang)
-    pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
-    if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
-    if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies')
-    if no_entities and 'entities' in pipeline: pipeline.remove('entities')
+    pipeline = ['tagger', 'parser', 'ner']
+    if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
+    if no_parser and 'parser' in pipeline: pipeline.remove('parser')
+    if no_entities and 'ner' in pipeline: pipeline.remove('ner')

     # Take dropout and batch size as generators of values -- dropout
     # starts high and decays sharply, to force the optimizer to explore.

@@ -68,33 +82,30 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                                   util.env_opt('dropout_to', 0.2),
                                   util.env_opt('dropout_decay', 0.0))
     batch_sizes = util.compounding(util.env_opt('batch_from', 1),
-                                   util.env_opt('batch_to', 64),
+                                   util.env_opt('batch_to', 16),
                                    util.env_opt('batch_compound', 1.001))

-    if resume:
-        prints(output_path / 'model9.pickle', title="Resuming training")
-        nlp = dill.load((output_path / 'model9.pickle').open('rb'))
-    else:
-        nlp = lang_class(pipeline=pipeline)
     corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
     n_train_words = corpus.count_train()

+    lang_class = util.get_lang_class(lang)
+    nlp = lang_class(pipeline=pipeline)
+    if vectors:
+        util.load_model(vectors, vocab=nlp.vocab)
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
+    nlp._optimizer = None

-    print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
+    print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
     try:
+        train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
+                                       gold_preproc=gold_preproc, max_length=0)
+        train_docs = list(train_docs)
         for i in range(n_iter):
-            if resume:
-                i += 20
             with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
-                train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
-                                               gold_preproc=gold_preproc, max_length=0)
                 losses = {}
                 for batch in minibatch(train_docs, size=batch_sizes):
                     docs, golds = zip(*batch)
                     nlp.update(docs, golds, sgd=optimizer,
-                               drop=next(dropout_rates), losses=losses,
-                               update_shared=True)
+                               drop=next(dropout_rates), losses=losses)
                     pbar.update(sum(len(doc) for doc in docs))

             with nlp.use_params(optimizer.averages):

@@ -104,12 +115,22 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                 nlp_loaded = lang_class(pipeline=pipeline)
                 nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
                 scorer = nlp_loaded.evaluate(
-                    corpus.dev_docs(
+                    list(corpus.dev_docs(
                         nlp_loaded,
-                        gold_preproc=gold_preproc))
+                        gold_preproc=gold_preproc)))
                 acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
                 with acc_loc.open('w') as file_:
                     file_.write(json_dumps(scorer.scores))
+                meta_loc = output_path / ('model%d' % i) / 'meta.json'
+                meta['accuracy'] = scorer.scores
+                meta['lang'] = nlp.lang
+                meta['pipeline'] = pipeline
+                meta['spacy_version'] = '>=%s' % about.__version__
+                meta.setdefault('name', 'model%d' % i)
+                meta.setdefault('version', version)
+
+                with meta_loc.open('w') as file_:
+                    file_.write(json_dumps(meta))
                 util.set_env_log(True)
             print_progress(i, losses, scorer.scores)
     finally:

@@ -138,12 +159,14 @@ def print_progress(itn, losses, dev_scores, wps=0.0):
                 'ents_p', 'ents_r', 'ents_f', 'wps']:
         scores[col] = 0.0
     scores['dep_loss'] = losses.get('parser', 0.0)
+    scores['ner_loss'] = losses.get('ner', 0.0)
     scores['tag_loss'] = losses.get('tagger', 0.0)
     scores.update(dev_scores)
     scores['wps'] = wps
     tpl = '\t'.join((
         '{:d}',
         '{dep_loss:.3f}',
+        '{ner_loss:.3f}',
         '{uas:.3f}',
         '{ents_p:.3f}',
         '{ents_r:.3f}',
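With these changes, every epoch directory now gets a meta.json next to accuracy.json. Under the default settings it would look roughly like the sketch below; the language code and accuracy values are illustrative, only the keys follow from the code above:

# Illustrative shape of the per-epoch meta.json written by the loop above.
meta = {
    'lang': 'en',                                    # from nlp.lang (example value)
    'pipeline': ['tagger', 'parser', 'ner'],
    'spacy_version': '>=2.0.0a16',                   # from about.__version__ in this commit
    'name': 'model0',                                # default name for epoch 0
    'version': '0.0.0',                              # default --version
    'accuracy': {'uas': 0.0, 'las': 0.0, 'ents_f': 0.0, 'tags_acc': 0.0},  # Scorer.scores (example keys)
}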
@@ -7,6 +7,7 @@ import re
 import ujson
 import random
 import cytoolz
+import itertools

 from .syntax import nonproj
 from .util import ensure_path

@@ -146,9 +147,13 @@ def minibatch(items, size=8):
     '''Iterate over batches of items. `size` may be an iterator,
     so that batch-size can vary on each step.
     '''
+    if isinstance(size, int):
+        size_ = itertools.repeat(8)
+    else:
+        size_ = size
     items = iter(items)
     while True:
-        batch_size = next(size) #if hasattr(size, '__next__') else size
+        batch_size = next(size_)
         batch = list(cytoolz.take(int(batch_size), items))
         if len(batch) == 0:
             break
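One detail worth flagging in the new minibatch: when size is an int, the code builds itertools.repeat(8), so any integer other than 8 is silently ignored. A sketch of the presumably intended behaviour, not of what the committed code does:

import itertools

def fixed_or_variable_sizes(size):
    # Presumed intent: repeat the given integer, otherwise use the iterator as-is.
    if isinstance(size, int):
        return itertools.repeat(size)
    return size

assert next(fixed_or_variable_sizes(32)) == 32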
@@ -29,9 +29,9 @@ _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm
           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
           'TB T G M K %')
 _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
-_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
+_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ？ ！ ， 、 ； ： ～ ·'
 _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
-_hyphens = '- – — -- ---'
+_hyphens = '- – — -- --- —— ~'
 _other_symbols = r'[\p{So}]'

 UNITS = merge_chars(_units)
@@ -3,6 +3,7 @@ from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
+from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP

@@ -23,6 +24,7 @@ class GermanDefaults(Language.Defaults):
                                          NORM_EXCEPTIONS, BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    infixes = tuple(TOKENIZER_INFIXES)
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)
     syntax_iterators = dict(SYNTAX_ITERATORS)

spacy/lang/de/punctuation.py (new file, 20 lines)
@@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals

from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER


_quotes = QUOTES.replace("'", '')

_infixes = (LIST_ELLIPSES + LIST_ICONS +
            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
             r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
             r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
             r'(?<=[0-9])-(?=[0-9])'])


TOKENIZER_INFIXES = _infixes
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from .lemmatizer import LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS

@@ -17,6 +18,7 @@ from ...util import update_exc, add_lookups

 class FrenchDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: 'fr'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

spacy/lang/fr/lex_attrs.py (new file, 41 lines)
@@ -0,0 +1,41 @@
# coding: utf8
from __future__ import unicode_literals

from ...attrs import LIKE_NUM


_num_words = set("""
zero un deux trois quatre cinq six sept huit neuf dix
onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
vingt trente quanrante cinquante soixante septante quatre-vingt huitante nonante
cent mille mil million milliard billion quadrillion quintillion
sextillion septillion octillion nonillion decillion
""".split())

_ordinal_words = set("""
premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième
onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neufième
vingtième trentième quanrantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième
centième millième millionnième milliardième billionnième quadrillionnième quintillionnième
sextillionnième septillionnième octillionnième nonillionnième decillionnième
""".split())


def like_num(text):
    # Might require more work?
    # See this discussion: https://github.com/explosion/spaCy/pull/1161
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:
        return True
    return False


LEX_ATTRS = {
    LIKE_NUM: like_num
}
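A quick check of the new French like_num attribute; the tokens below are examples only. (As an aside, the committed word lists spell "quanrante" and "dix-neufième", which look like typos for "quarante" and "dix-neuvième".)

# Example calls against the module added above; assumes this spaCy checkout is importable.
from spacy.lang.fr.lex_attrs import like_num

print(like_num('deux'))      # True -- in the number-word list
print(like_num('10.000'))    # True -- separators are stripped before the digit check
print(like_num('1/2'))       # True -- simple fractions are accepted
print(like_num('bonjour'))   # False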
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS

@@ -12,6 +13,7 @@ from ...util import update_exc, add_lookups

 class DutchDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: 'nl'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

spacy/lang/nl/lex_attrs.py (new file, 40 lines)
@@ -0,0 +1,40 @@
# coding: utf8
from __future__ import unicode_literals

from ...attrs import LIKE_NUM


_num_words = set("""
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd
duizend miljoen miljard biljoen biljard triljoen triljard
""".split())

_ordinal_words = set("""
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste
zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste
miljardste biljoenste biljardste triljoenste triljardste
""".split())


def like_num(text):
    # This only does the most basic check for whether a token is a digit
    # or matches one of the number words. In order to handle numbers like
    # "drieëntwintig", more work is required.
    # See this discussion: https://github.com/explosion/spaCy/pull/1177
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:
        return True
    return False


LEX_ATTRS = {
    LIKE_NUM: like_num
}
spacy/lang/th/__init__.py (new file, 35 lines)
@@ -0,0 +1,35 @@
# coding: utf8
from __future__ import unicode_literals

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...tokens import Doc
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups

class ThaiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'th'
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    tag_map = dict(TAG_MAP)
    stop_words = set(STOP_WORDS)


class Thai(Language):
    lang = 'th'
    Defaults = ThaiDefaults
    def make_doc(self, text):
        try:
            from pythainlp.tokenize import word_tokenize
        except ImportError:
            raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
                              "https://github.com/wannaphongcom/pythainlp/")
        words = [x for x in list(word_tokenize(text,"newmm"))]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))

__all__ = ['Thai']
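A minimal usage sketch for the new Thai language class. It assumes PyThaiNLP is installed and that the class can be instantiated directly with defaults; the example sentence and its tokenisation are illustrative only.

from spacy.lang.th import Thai

nlp = Thai()
doc = nlp.make_doc('ผมรักคุณ')
print([t.text for t in doc])  # tokens as produced by PyThaiNLP's "newmm" tokenizer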
62
spacy/lang/th/stop_words.py
Normal file
62
spacy/lang/th/stop_words.py
Normal file
|
@ -0,0 +1,62 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
# data from https://github.com/wannaphongcom/pythainlp/blob/dev/pythainlp/corpus/stopwords-th.txt
|
||||||
|
# stop words as whitespace-separated list
|
||||||
|
STOP_WORDS = set("""
|
||||||
|
นี้ นํา นั้น นัก นอกจาก ทุก ที่สุด ที่ ทําให้ ทํา ทาง ทั้งนี้ ดัง ซึ่ง ช่วง จาก จัด จะ คือ ความ ครั้ง คง ขึ้น ของ
|
||||||
|
ขอ รับ ระหว่าง รวม ยัง มี มาก มา พร้อม พบ ผ่าน ผล บาง น่า เปิดเผย เปิด เนื่องจาก เดียวกัน เดียว เช่น เฉพาะ เข้า ถ้า
|
||||||
|
ถูก ถึง ต้อง ต่างๆ ต่าง ต่อ ตาม ตั้งแต่ ตั้ง ด้าน ด้วย อีก อาจ ออก อย่าง อะไร อยู่ อยาก หาก หลาย หลังจาก แต่ เอง เห็น
|
||||||
|
เลย เริ่ม เรา เมื่อ เพื่อ เพราะ เป็นการ เป็น หลัง หรือ หนึ่ง ส่วน ส่ง สุด สําหรับ ว่า ลง ร่วม ราย ขณะ ก่อน ก็ การ กับ กัน
|
||||||
|
กว่า กล่าว จึง ไว้ ไป ได้ ให้ ใน โดย แห่ง แล้ว และ แรก แบบ ๆ ทั้ง วัน เขา เคย ไม่ อยาก เกิน เกินๆ เกี่ยวกัน เกี่ยวกับ
|
||||||
|
เกี่ยวข้อง เกี่ยวเนื่อง เกี่ยวๆ เกือบ เกือบจะ เกือบๆ แก แก่ แก้ไข ใกล้ ใกล้ๆ ไกล ไกลๆ ขณะเดียวกัน ขณะใด ขณะใดๆ ขณะที่ ขณะนั้น ขณะนี้ ขณะหนึ่ง ขวาง
|
||||||
|
ขวางๆ ขั้น ใคร ใคร่ ใคร่จะ ใครๆ ง่าย ง่ายๆ ไง จง จด จน จนกระทั่ง จนกว่า จนขณะนี้ จนตลอด จนถึง จนทั่ว จนบัดนี้ จนเมื่อ จนแม้ จนแม้น
|
||||||
|
จรด จรดกับ จริง จริงจัง จริงๆ จริงๆจังๆ จวน จวนจะ จวนเจียน จวบ ซึ่งก็ ซึ่งก็คือ ซึ่งกัน ซึ่งกันและกัน ซึ่งได้แก่ ซึ่งๆ ณ ด้วย ด้วยกัน ด้วยเช่นกัน ด้วยที่ ด้วยประการฉะนี้
|
||||||
|
ด้วยเพราะ ด้วยว่า ด้วยเหตุที่ ด้วยเหตุนั้น ด้วยเหตุนี้ ด้วยเหตุเพราะ ด้วยเหตุว่า ด้วยเหมือนกัน ดั่ง ดังกล่าว ดังกับ ดั่งกับ ดังกับว่า ดั่งกับว่า ดังเก่า
|
||||||
|
ดั่งเก่า ดังเคย ใดๆ ได้ ได้แก่ ได้แต่ ได้ที่ ได้มา ได้รับ ตน ตนเอง ตนฯ ตรง ตรงๆ ตลอด ตลอดกาล ตลอดกาลนาน ตลอดจน ตลอดถึง ตลอดทั้ง
|
||||||
|
ตลอดทั่ว ตลอดทั่วถึง ตลอดทั่วทั้ง ตลอดปี ตลอดไป ตลอดมา ตลอดระยะเวลา ตลอดวัน ตลอดเวลา ตลอดศก ต่อ ต่อกัน ถึงแก่ ถึงจะ ถึงบัดนั้น ถึงบัดนี้
|
||||||
|
ถึงเมื่อ ถึงเมื่อใด ถึงเมื่อไร ถึงแม้ ถึงแม้จะ ถึงแม้ว่า ถึงอย่างไร ถือ ถือว่า ถูกต้อง ถูกๆ เถอะ เถิด ทรง ทว่า ทั้งคน ทั้งตัว ทั้งที ทั้งที่ ทั้งนั้น ทั้งนั้นด้วย ทั้งนั้นเพราะ
|
||||||
|
นอก นอกจากที่ นอกจากนั้น นอกจากนี้ นอกจากว่า นอกนั้น นอกเหนือ นอกเหนือจาก น้อย น้อยกว่า น้อยๆ นะ น่ะ นักๆ นั่น นั่นไง นั่นเป็น นั่นแหละ
|
||||||
|
นั่นเอง นั้นๆ นับ นับจากนั้น นับจากนี้ นับตั้งแต่ นับแต่ นับแต่ที่ นับแต่นั้น เป็นต้น เป็นต้นไป เป็นต้นมา เป็นแต่ เป็นแต่เพียง เป็นที เป็นที่ เป็นที่สุด เป็นเพราะ
|
||||||
|
เป็นเพราะว่า เป็นเพียง เป็นเพียงว่า เป็นเพื่อ เป็นอัน เป็นอันมาก เป็นอันว่า เป็นอันๆ เป็นอาทิ เป็นๆ เปลี่ยน เปลี่ยนแปลง เปิด เปิดเผย ไป่ ผ่าน ผ่านๆ
|
||||||
|
ผิด ผิดๆ ผู้ เพียงเพื่อ เพียงไร เพียงไหน เพื่อที่ เพื่อที่จะ เพื่อว่า เพื่อให้ ภาค ภาคฯ ภาย ภายใต้ ภายนอก ภายใน ภายภาค ภายภาคหน้า ภายหน้า ภายหลัง
|
||||||
|
มอง มองว่า มัก มักจะ มัน มันๆ มั้ย มั้ยนะ มั้ยนั่น มั้ยเนี่ย มั้ยล่ะ ยืนนาน ยืนยง ยืนยัน ยืนยาว เยอะ เยอะแยะ เยอะๆ แยะ แยะๆ รวด รวดเร็ว ร่วม รวมกัน ร่วมกัน
|
||||||
|
รวมด้วย ร่วมด้วย รวมถึง รวมทั้ง ร่วมมือ รวมๆ ระยะ ระยะๆ ระหว่าง รับรอง รึ รึว่า รือ รือว่า สิ้นกาลนาน สืบเนื่อง สุดๆ สู่ สูง สูงกว่า สูงส่ง สูงสุด สูงๆ เสมือนกับ
|
||||||
|
เสมือนว่า เสร็จ เสร็จกัน เสร็จแล้ว เสร็จสมบูรณ์ เสร็จสิ้น เสีย เสียก่อน เสียจน เสียจนกระทั่ง เสียจนถึง เสียด้วย เสียนั่น เสียนั่นเอง เสียนี่ เสียนี่กระไร เสียยิ่ง
|
||||||
|
เสียยิ่งนัก เสียแล้ว ใหญ่ๆ ให้ดี ให้แด่ ให้ไป ใหม่ ให้มา ใหม่ๆ ไหน ไหนๆ อดีต อนึ่ง อย่าง อย่างเช่น อย่างดี อย่างเดียว อย่างใด อย่างที่ อย่างน้อย อย่างนั้น
|
||||||
|
อย่างนี้ อย่างโน้น ก็คือ ก็แค่ ก็จะ ก็ดี ก็ได้ ก็ต่อเมื่อ ก็ตาม ก็ตามแต่ ก็ตามที ก็แล้วแต่ กระทั่ง กระทำ กระนั้น กระผม กลับ กล่าวคือ กลุ่ม กลุ่มก้อน
|
||||||
|
กลุ่มๆ กว้าง กว้างขวาง กว้างๆ ก่อนหน้า ก่อนหน้านี้ ก่อนๆ กันดีกว่า กันดีไหม กันเถอะ กันนะ กันและกัน กันไหม กันเอง กำลัง กำลังจะ กำหนด กู เก็บ
|
||||||
|
เกิด เกี่ยวข้อง แก่ แก้ไข ใกล้ ใกล้ๆ ข้า ข้าง ข้างเคียง ข้างต้น ข้างบน ข้างล่าง ข้างๆ ขาด ข้าพเจ้า ข้าฯ เข้าใจ เขียน คงจะ คงอยู่ ครบ ครบครัน ครบถ้วน
|
||||||
|
ครั้งกระนั้น ครั้งก่อน ครั้งครา ครั้งคราว ครั้งใด ครั้งที่ ครั้งนั้น ครั้งนี้ ครั้งละ ครั้งหนึ่ง ครั้งหลัง ครั้งหลังสุด ครั้งไหน ครั้งๆ ครัน ครับ ครา คราใด คราที่ ครานั้น ครานี้ คราหนึ่ง
|
||||||
|
คราไหน คราว คราวก่อน คราวใด คราวที่ คราวนั้น คราวนี้ คราวโน้น คราวละ คราวหน้า คราวหนึ่ง คราวหลัง คราวไหน คราวๆ คล้าย คล้ายกัน คล้ายกันกับ
|
||||||
|
คล้ายกับ คล้ายกับว่า คล้ายว่า ควร ค่อน ค่อนข้าง ค่อนข้างจะ ค่อยไปทาง ค่อนมาทาง ค่อย ค่อยๆ คะ ค่ะ คำ คิด คิดว่า คุณ คุณๆ
|
||||||
|
เคยๆ แค่ แค่จะ แค่นั้น แค่นี้ แค่เพียง แค่ว่า แค่ไหน ใคร่ ใคร่จะ ง่าย ง่ายๆ จนกว่า จนแม้ จนแม้น จังๆ จวบกับ จวบจน จ้ะ จ๊ะ จะได้ จัง จัดการ จัดงาน จัดแจง
|
||||||
|
จัดตั้ง จัดทำ จัดหา จัดให้ จับ จ้า จ๋า จากนั้น จากนี้ จากนี้ไป จำ จำเป็น จำพวก จึงจะ จึงเป็น จู่ๆ ฉะนั้น ฉะนี้ ฉัน เฉกเช่น เฉย เฉยๆ ไฉน ช่วงก่อน
|
||||||
|
ช่วงต่อไป ช่วงถัดไป ช่วงท้าย ช่วงที่ ช่วงนั้น ช่วงนี้ ช่วงระหว่าง ช่วงแรก ช่วงหน้า ช่วงหลัง ช่วงๆ ช่วย ช้า ช้านาน ชาว ช้าๆ เช่นก่อน เช่นกัน เช่นเคย
|
||||||
|
เช่นดัง เช่นดังก่อน เช่นดังเก่า เช่นดังที่ เช่นดังว่า เช่นเดียวกัน เช่นเดียวกับ เช่นใด เช่นที่ เช่นที่เคย เช่นที่ว่า เช่นนั้น เช่นนั้นเอง เช่นนี้ เช่นเมื่อ เช่นไร เชื่อ
|
||||||
|
เชื่อถือ เชื่อมั่น เชื่อว่า ใช่ ใช่ไหม ใช้ ซะ ซะก่อน ซะจน ซะจนกระทั่ง ซะจนถึง ซึ่งได้แก่ ด้วยกัน ด้วยเช่นกัน ด้วยที่ ด้วยเพราะ ด้วยว่า ด้วยเหตุที่ ด้วยเหตุนั้น
|
||||||
|
ด้วยเหตุนี้ ด้วยเหตุเพราะ ด้วยเหตุว่า ด้วยเหมือนกัน ดังกล่าว ดังกับว่า ดั่งกับว่า ดังเก่า ดั่งเก่า ดั่งเคย ต่างก็ ต่างหาก ตามด้วย ตามแต่ ตามที่
|
||||||
|
ตามๆ เต็มไปด้วย เต็มไปหมด เต็มๆ แต่ก็ แต่ก่อน แต่จะ แต่เดิม แต่ต้อง แต่ถ้า แต่ทว่า แต่ที่ แต่นั้น แต่เพียง แต่เมื่อ แต่ไร แต่ละ แต่ว่า แต่ไหน แต่อย่างใด โต
|
||||||
|
โตๆ ใต้ ถ้าจะ ถ้าหาก ถึงแก่ ถึงแม้ ถึงแม้จะ ถึงแม้ว่า ถึงอย่างไร ถือว่า ถูกต้อง ทว่า ทั้งนั้นด้วย ทั้งปวง ทั้งเป็น ทั้งมวล ทั้งสิ้น ทั้งหมด ทั้งหลาย ทั้งๆ ทัน
|
||||||
|
ทันใดนั้น ทันที ทันทีทันใด ทั่ว ทำไม ทำไร ทำให้ ทำๆ ที ที่จริง ที่ซึ่ง ทีเดียว ทีใด ที่ใด ที่ได้ ทีเถอะ ที่แท้ ที่แท้จริง ที่นั้น ที่นี้ ทีไร ทีละ ที่ละ
|
||||||
|
ที่แล้ว ที่ว่า ที่แห่งนั้น ที่ไหน ทีๆ ที่ๆ ทุกคน ทุกครั้ง ทุกครา ทุกคราว ทุกชิ้น ทุกตัว ทุกทาง ทุกที ทุกที่ ทุกเมื่อ ทุกวัน ทุกวันนี้ ทุกสิ่ง ทุกหน ทุกแห่ง ทุกอย่าง
|
||||||
|
ทุกอัน ทุกๆ เท่า เท่ากัน เท่ากับ เท่าใด เท่าที่ เท่านั้น เท่านี้ เท่าไร เท่าไหร่ แท้ แท้จริง เธอ นอกจากว่า น้อย น้อยกว่า น้อยๆ น่ะ นั้นไว นับแต่นี้ นาง
|
||||||
|
นางสาว น่าจะ นาน นานๆ นาย นำ นำพา นำมา นิด นิดหน่อย นิดๆ นี่ นี่ไง นี่นา นี่แน่ะ นี่แหละ นี้แหล่ นี่เอง นี้เอง นู่น นู้น เน้น เนี่ย
|
||||||
|
เนี่ยเอง ในช่วง ในที่ ในเมื่อ ในระหว่าง บน บอก บอกแล้ว บอกว่า บ่อย บ่อยกว่า บ่อยครั้ง บ่อยๆ บัดดล บัดเดี๋ยวนี้ บัดนั้น บัดนี้ บ้าง บางกว่า
|
||||||
|
บางขณะ บางครั้ง บางครา บางคราว บางที บางที่ บางแห่ง บางๆ ปฏิบัติ ประกอบ ประการ ประการฉะนี้ ประการใด ประการหนึ่ง ประมาณ ประสบ ปรับ
|
||||||
|
ปรากฏ ปรากฏว่า ปัจจุบัน ปิด เป็นด้วย เป็นดัง เป็นต้น เป็นแต่ เป็นเพื่อ เป็นอัน เป็นอันมาก เป็นอาทิ ผ่านๆ ผู้ ผู้ใด เผื่อ เผื่อจะ เผื่อที่ เผื่อว่า ฝ่าย
|
||||||
|
ฝ่ายใด พบว่า พยายาม พร้อมกัน พร้อมกับ พร้อมด้วย พร้อมทั้ง พร้อมที่ พร้อมเพียง พวก พวกกัน พวกกู พวกแก พวกเขา พวกคุณ พวกฉัน พวกท่าน
|
||||||
|
พวกที่ พวกเธอ พวกนั้น พวกนี้ พวกนู้น พวกโน้น พวกมัน พวกมึง พอ พอกัน พอควร พอจะ พอดี พอตัว พอที พอที่ พอเพียง พอแล้ว พอสม พอสมควร
|
||||||
|
พอเหมาะ พอๆ พา พึง พึ่ง พื้นๆ พูด เพราะฉะนั้น เพราะว่า เพิ่ง เพิ่งจะ เพิ่ม เพิ่มเติม เพียง เพียงแค่ เพียงใด เพียงแต่ เพียงพอ เพียงเพราะ
|
||||||
|
เพื่อว่า เพื่อให้ ภายใต้ มองว่า มั๊ย มากกว่า มากมาย มิ มิฉะนั้น มิใช่ มิได้ มีแต่ มึง มุ่ง มุ่งเน้น มุ่งหมาย เมื่อก่อน เมื่อครั้ง เมื่อครั้งก่อน
|
||||||
|
เมื่อคราวก่อน เมื่อคราวที่ เมื่อคราว เมื่อคืน เมื่อเช้า เมื่อใด เมื่อนั้น เมื่อนี้ เมื่อเย็น เมื่อไร เมื่อวันวาน เมื่อวาน เมื่อไหร่ แม้ แม้กระทั่ง แม้แต่ แม้นว่า แม้ว่า
|
||||||
|
ไม่ค่อย ไม่ค่อยจะ ไม่ค่อยเป็น ไม่ใช่ ไม่เป็นไร ไม่ว่า ยก ยกให้ ยอม ยอมรับ ย่อม ย่อย ยังคง ยังงั้น ยังงี้ ยังโง้น ยังไง ยังจะ ยังแต่ ยาก
|
||||||
|
ยาว ยาวนาน ยิ่ง ยิ่งกว่า ยิ่งขึ้น ยิ่งขึ้นไป ยิ่งจน ยิ่งจะ ยิ่งนัก ยิ่งเมื่อ ยิ่งแล้ว ยิ่งใหญ่ ร่วมกัน รวมด้วย ร่วมด้วย รือว่า เร็ว เร็วๆ เราๆ เรียก เรียบ เรื่อย
|
||||||
|
เรื่อยๆ ไร ล้วน ล้วนจน ล้วนแต่ ละ ล่าสุด เล็ก เล็กน้อย เล็กๆ เล่าว่า แล้วกัน แล้วแต่ แล้วเสร็จ วันใด วันนั้น วันนี้ วันไหน สบาย สมัย สมัยก่อน
|
||||||
|
สมัยนั้น สมัยนี้ สมัยโน้น ส่วนเกิน ส่วนด้อย ส่วนดี ส่วนใด ส่วนที่ ส่วนน้อย ส่วนนั้น ส่วนมาก ส่วนใหญ่ สั้น สั้นๆ สามารถ สำคัญ สิ่ง
|
||||||
|
สิ่งใด สิ่งนั้น สิ่งนี้ สิ่งไหน สิ้น เสร็จแล้ว เสียด้วย เสียแล้ว แสดง แสดงว่า หน หนอ หนอย หน่อย หมด หมดกัน หมดสิ้น หรือไง หรือเปล่า หรือไม่ หรือยัง
|
||||||
|
หรือไร หากแม้ หากแม้น หากแม้นว่า หากว่า หาความ หาใช่ หารือ เหตุ เหตุผล เหตุนั้น เหตุนี้ เหตุไร เห็นแก่ เห็นควร เห็นจะ เห็นว่า เหลือ เหลือเกิน เหล่า
|
||||||
|
เหล่านั้น เหล่านี้ แห่งใด แห่งนั้น แห่งนี้ แห่งโน้น แห่งไหน แหละ ให้แก่ ใหญ่ ใหญ่โต อย่างเช่น อย่างดี อย่างเดียว อย่างใด อย่างที่ อย่างน้อย อย่างนั้น อย่างนี้
|
||||||
|
อย่างโน้น อย่างมาก อย่างยิ่ง อย่างไร อย่างไรก็ อย่างไรก็ได้ อย่างไรเสีย อย่างละ อย่างหนึ่ง อย่างไหน อย่างๆ อัน อันจะ อันใด อันได้แก่ อันที่
|
||||||
|
อันที่จริง อันที่จะ อันเนื่องมาจาก อันละ อันไหน อันๆ อาจจะ อาจเป็น อาจเป็นด้วย อื่น อื่นๆ เอ็ง เอา ฯ ฯล ฯลฯ
|
||||||
|
""".split())
|
81
spacy/lang/th/tag_map.py
Normal file
81
spacy/lang/th/tag_map.py
Normal file
|
@ -0,0 +1,81 @@
|
||||||
|
# encoding: utf8
|
||||||
|
# data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1)
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ...symbols import *
|
||||||
|
|
||||||
|
TAG_MAP = {
|
||||||
|
#NOUN
|
||||||
|
"NOUN": {POS: NOUN},
|
||||||
|
"NCMN": {POS: NOUN},
|
||||||
|
"NTTL": {POS: NOUN},
|
||||||
|
"CNIT": {POS: NOUN},
|
||||||
|
"CLTV": {POS: NOUN},
|
||||||
|
"CMTR": {POS: NOUN},
|
||||||
|
"CFQC": {POS: NOUN},
|
||||||
|
"CVBL": {POS: NOUN},
|
||||||
|
#PRON
|
||||||
|
"PRON": {POS: PRON},
|
||||||
|
"NPRP": {POS: PRON},
|
||||||
|
# ADJ
|
||||||
|
"ADJ": {POS: ADJ},
|
||||||
|
"NONM": {POS: ADJ},
|
||||||
|
"VATT": {POS: ADJ},
|
||||||
|
"DONM": {POS: ADJ},
|
||||||
|
# ADV
|
||||||
|
"ADV": {POS: ADV},
|
||||||
|
"ADVN": {POS: ADV},
|
||||||
|
"ADVI": {POS: ADV},
|
||||||
|
"ADVP": {POS: ADV},
|
||||||
|
"ADVS": {POS: ADV},
|
||||||
|
# INT
|
||||||
|
"INT": {POS: INTJ},
|
||||||
|
# PRON
|
||||||
|
"PROPN": {POS: PROPN},
|
||||||
|
"PPRS": {POS: PROPN},
|
||||||
|
"PDMN": {POS: PROPN},
|
||||||
|
"PNTR": {POS: PROPN},
|
||||||
|
# DET
|
||||||
|
"DET": {POS: DET},
|
||||||
|
"DDAN": {POS: DET},
|
||||||
|
"DDAC": {POS: DET},
|
||||||
|
"DDBQ": {POS: DET},
|
||||||
|
"DDAQ": {POS: DET},
|
||||||
|
"DIAC": {POS: DET},
|
||||||
|
"DIBQ": {POS: DET},
|
||||||
|
"DIAQ": {POS: DET},
|
||||||
|
"DCNM": {POS: DET},
|
||||||
|
# NUM
|
||||||
|
"NUM": {POS: NUM},
|
||||||
|
"NCNM": {POS: NUM},
|
||||||
|
"NLBL": {POS: NUM},
|
||||||
|
"DCNM": {POS: NUM},
|
||||||
|
# AUX
|
||||||
|
"AUX": {POS: AUX},
|
||||||
|
"XVBM": {POS: AUX},
|
||||||
|
"XVAM": {POS: AUX},
|
||||||
|
"XVMM": {POS: AUX},
|
||||||
|
"XVBB": {POS: AUX},
|
||||||
|
"XVAE": {POS: AUX},
|
||||||
|
# ADP
|
||||||
|
"ADP": {POS: ADP},
|
||||||
|
"RPRE": {POS: ADP},
|
||||||
|
# CCONJ
|
||||||
|
"CCONJ": {POS: CCONJ},
|
||||||
|
"JCRG": {POS: CCONJ},
|
||||||
|
# SCONJ
|
||||||
|
"SCONJ": {POS: SCONJ},
|
||||||
|
"PREL": {POS: SCONJ},
|
||||||
|
"JSBR": {POS: SCONJ},
|
||||||
|
"JCMP": {POS: SCONJ},
|
||||||
|
# PART
|
||||||
|
"PART": {POS: PART},
|
||||||
|
"FIXN": {POS: PART},
|
||||||
|
"FIXV": {POS: PART},
|
||||||
|
"EAFF": {POS: PART},
|
||||||
|
"AITT": {POS: PART},
|
||||||
|
"NEG": {POS: PART},
|
||||||
|
# PUNCT
|
||||||
|
"PUNCT": {POS: PUNCT},
|
||||||
|
"PUNC": {POS: PUNCT}
|
||||||
|
}
|
43
spacy/lang/th/tokenizer_exceptions.py
Normal file
43
spacy/lang/th/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ...symbols import *
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = {
|
||||||
|
"ม.ค.": [
|
||||||
|
{ORTH: "ม.ค.", LEMMA: "มกราคม"}
|
||||||
|
],
|
||||||
|
"ก.พ.": [
|
||||||
|
{ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"}
|
||||||
|
],
|
||||||
|
"มี.ค.": [
|
||||||
|
{ORTH: "มี.ค.", LEMMA: "มีนาคม"}
|
||||||
|
],
|
||||||
|
"เม.ย.": [
|
||||||
|
{ORTH: "เม.ย.", LEMMA: "เมษายน"}
|
||||||
|
],
|
||||||
|
"พ.ค.": [
|
||||||
|
{ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}
|
||||||
|
],
|
||||||
|
"มิ.ย.": [
|
||||||
|
{ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}
|
||||||
|
],
|
||||||
|
"ก.ค.": [
|
||||||
|
{ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}
|
||||||
|
],
|
||||||
|
"ส.ค.": [
|
||||||
|
{ORTH: "ส.ค.", LEMMA: "สิงหาคม"}
|
||||||
|
],
|
||||||
|
"ก.ย.": [
|
||||||
|
{ORTH: "ก.ย.", LEMMA: "กันยายน"}
|
||||||
|
],
|
||||||
|
"ต.ค.": [
|
||||||
|
{ORTH: "ต.ค.", LEMMA: "ตุลาคม"}
|
||||||
|
],
|
||||||
|
"พ.ย.": [
|
||||||
|
{ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}
|
||||||
|
],
|
||||||
|
"ธ.ค.": [
|
||||||
|
{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}
|
||||||
|
]
|
||||||
|
}
|
|
@@ -14,8 +14,8 @@ class Chinese(Language):
         except ImportError:
             raise ImportError("The Chinese tokenizer requires the Jieba library: "
                               "https://github.com/fxsjy/jieba")
-        words = list(jieba.cut(text, cut_all=True))
-        words=[x for x in words if x]
+        words = list(jieba.cut(text, cut_all=False))
+        words = [x for x in words if x]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))
@ -34,6 +34,7 @@ from .lang.tag_map import TAG_MAP
|
||||||
from .lang.lex_attrs import LEX_ATTRS
|
from .lang.lex_attrs import LEX_ATTRS
|
||||||
from . import util
|
from . import util
|
||||||
from .scorer import Scorer
|
from .scorer import Scorer
|
||||||
|
from ._ml import link_vectors_to_models
|
||||||
|
|
||||||
|
|
||||||
class BaseDefaults(object):
|
class BaseDefaults(object):
|
||||||
|
@ -278,8 +279,7 @@ class Language(object):
|
||||||
def make_doc(self, text):
|
def make_doc(self, text):
|
||||||
return self.tokenizer(text)
|
return self.tokenizer(text)
|
||||||
|
|
||||||
def update(self, docs, golds, drop=0., sgd=None, losses=None,
|
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||||
update_shared=False):
|
|
||||||
"""Update the models in the pipeline.
|
"""Update the models in the pipeline.
|
||||||
|
|
||||||
docs (iterable): A batch of `Doc` objects.
|
docs (iterable): A batch of `Doc` objects.
|
||||||
|
@ -303,32 +303,17 @@ class Language(object):
|
||||||
if self._optimizer is None:
|
if self._optimizer is None:
|
||||||
self._optimizer = Adam(Model.ops, 0.001)
|
self._optimizer = Adam(Model.ops, 0.001)
|
||||||
sgd = self._optimizer
|
sgd = self._optimizer
|
||||||
tok2vec = self.pipeline[0]
|
|
||||||
feats = tok2vec.doc2feats(docs)
|
|
||||||
grads = {}
|
grads = {}
|
||||||
def get_grads(W, dW, key=None):
|
def get_grads(W, dW, key=None):
|
||||||
grads[key] = (W, dW)
|
grads[key] = (W, dW)
|
||||||
pipes = list(self.pipeline[1:])
|
pipes = list(self.pipeline)
|
||||||
random.shuffle(pipes)
|
random.shuffle(pipes)
|
||||||
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
|
|
||||||
all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
|
|
||||||
for proc in pipes:
|
for proc in pipes:
|
||||||
if not hasattr(proc, 'update'):
|
if not hasattr(proc, 'update'):
|
||||||
continue
|
continue
|
||||||
d_tokvecses = proc.update((docs, tokvecses), golds,
|
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
|
||||||
drop=drop, sgd=get_grads, losses=losses)
|
|
||||||
if update_shared and d_tokvecses is not None:
|
|
||||||
for i, d_tv in enumerate(d_tokvecses):
|
|
||||||
all_d_tokvecses[i] += d_tv
|
|
||||||
if update_shared and bp_tokvecses is not None:
|
|
||||||
bp_tokvecses(all_d_tokvecses, sgd=sgd)
|
|
||||||
for key, (W, dW) in grads.items():
|
for key, (W, dW) in grads.items():
|
||||||
sgd(W, dW, key=key)
|
sgd(W, dW, key=key)
|
||||||
# Clear the tensor variable, to free GPU memory.
|
|
||||||
# If we don't do this, the memory leak gets pretty
|
|
||||||
# bad, because we may be holding part of a batch.
|
|
||||||
for doc in docs:
|
|
||||||
doc.tensor = None
|
|
||||||
|
|
||||||
def preprocess_gold(self, docs_golds):
|
def preprocess_gold(self, docs_golds):
|
||||||
"""Can be called before training to pre-process gold data. By default,
|
"""Can be called before training to pre-process gold data. By default,
|
||||||
|
@ -343,36 +328,49 @@ class Language(object):
|
||||||
for doc, gold in docs_golds:
|
for doc, gold in docs_golds:
|
||||||
yield doc, gold
|
yield doc, gold
|
||||||
|
|
||||||
def begin_training(self, get_gold_tuples, **cfg):
|
def resume_training(self, **cfg):
|
||||||
|
if cfg.get('device', -1) >= 0:
|
||||||
|
device = util.use_gpu(cfg['device'])
|
||||||
|
if self.vocab.vectors.data.shape[1] >= 1:
|
||||||
|
self.vocab.vectors.data = Model.ops.asarray(
|
||||||
|
self.vocab.vectors.data)
|
||||||
|
else:
|
||||||
|
device = None
|
||||||
|
learn_rate = util.env_opt('learn_rate', 0.001)
|
||||||
|
beta1 = util.env_opt('optimizer_B1', 0.9)
|
||||||
|
beta2 = util.env_opt('optimizer_B2', 0.999)
|
||||||
|
eps = util.env_opt('optimizer_eps', 1e-08)
|
||||||
|
L2 = util.env_opt('L2_penalty', 1e-6)
|
||||||
|
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
||||||
|
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
||||||
|
beta2=beta2, eps=eps)
|
||||||
|
self._optimizer.max_grad_norm = max_grad_norm
|
||||||
|
self._optimizer.device = device
|
||||||
|
return self._optimizer
|
||||||
|
|
||||||
|
def begin_training(self, get_gold_tuples=None, **cfg):
|
||||||
"""Allocate models, pre-process training data and acquire a trainer and
|
"""Allocate models, pre-process training data and acquire a trainer and
|
||||||
optimizer. Used as a contextmanager.
|
optimizer. Used as a contextmanager.
|
||||||
|
|
||||||
gold_tuples (iterable): Gold-standard training data.
|
get_gold_tuples (function): Function returning gold data
|
||||||
**cfg: Config parameters.
|
**cfg: Config parameters.
|
||||||
YIELDS (tuple): A trainer and an optimizer.
|
returns: An optimizer
|
||||||
|
|
||||||
EXAMPLE:
|
|
||||||
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
|
||||||
>>> for epoch in trainer.epochs(gold):
|
|
||||||
>>> for docs, golds in epoch:
|
|
||||||
>>> state = nlp.update(docs, golds, sgd=optimizer)
|
|
||||||
"""
|
"""
|
||||||
if self.parser:
|
|
||||||
self.pipeline.append(NeuralLabeller(self.vocab))
|
|
||||||
# Populate vocab
|
# Populate vocab
|
||||||
for _, annots_brackets in get_gold_tuples():
|
if get_gold_tuples is not None:
|
||||||
for annots, _ in annots_brackets:
|
for _, annots_brackets in get_gold_tuples():
|
||||||
for word in annots[1]:
|
for annots, _ in annots_brackets:
|
||||||
_ = self.vocab[word]
|
for word in annots[1]:
|
||||||
|
_ = self.vocab[word]
|
||||||
contexts = []
|
contexts = []
|
||||||
if cfg.get('device', -1) >= 0:
|
if cfg.get('device', -1) >= 0:
|
||||||
import cupy.cuda.device
|
device = util.use_gpu(cfg['device'])
|
||||||
device = cupy.cuda.device.Device(cfg['device'])
|
if self.vocab.vectors.data.shape[1] >= 1:
|
||||||
device.use()
|
self.vocab.vectors.data = Model.ops.asarray(
|
||||||
Model.ops = CupyOps()
|
self.vocab.vectors.data)
|
||||||
Model.Ops = CupyOps
|
|
||||||
else:
|
else:
|
||||||
device = None
|
device = None
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
for proc in self.pipeline:
|
for proc in self.pipeline:
|
||||||
if hasattr(proc, 'begin_training'):
|
if hasattr(proc, 'begin_training'):
|
||||||
context = proc.begin_training(get_gold_tuples(),
|
context = proc.begin_training(get_gold_tuples(),
|
||||||
|
@ -390,7 +388,7 @@ class Language(object):
|
||||||
self._optimizer.device = device
|
self._optimizer.device = device
|
||||||
return self._optimizer
|
return self._optimizer
|
||||||
|
|
||||||
def evaluate(self, docs_golds):
|
def evaluate(self, docs_golds, verbose=False):
|
||||||
scorer = Scorer()
|
scorer = Scorer()
|
||||||
docs, golds = zip(*docs_golds)
|
docs, golds = zip(*docs_golds)
|
||||||
docs = list(docs)
|
docs = list(docs)
|
||||||
|
@ -403,8 +401,9 @@ class Language(object):
|
||||||
docs = list(pipe.pipe(docs))
|
docs = list(pipe.pipe(docs))
|
||||||
assert len(docs) == len(golds)
|
assert len(docs) == len(golds)
|
||||||
for doc, gold in zip(docs, golds):
|
for doc, gold in zip(docs, golds):
|
||||||
scorer.score(doc, gold)
|
if verbose:
|
||||||
doc.tensor = None
|
print(doc)
|
||||||
|
scorer.score(doc, gold, verbose=verbose)
|
||||||
return scorer
|
return scorer
|
||||||
|
|
||||||
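A short usage sketch of the extended evaluate() signature; dev_docs_golds is an assumed iterable of (Doc, GoldParse) pairs:

    scorer = nlp.evaluate(dev_docs_golds, verbose=True)   # verbose=True prints each doc as it is scored
    print(scorer.scores)                                   # aggregate scores collected by the Scorer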
@contextmanager
|
@contextmanager
|
||||||
|
@ -493,7 +492,6 @@ class Language(object):
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
serializers = OrderedDict((
|
serializers = OrderedDict((
|
||||||
('vocab', lambda p: self.vocab.to_disk(p)),
|
|
||||||
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
|
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
|
||||||
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
|
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
|
||||||
))
|
))
|
||||||
|
@ -505,6 +503,7 @@ class Language(object):
|
||||||
if not hasattr(proc, 'to_disk'):
|
if not hasattr(proc, 'to_disk'):
|
||||||
continue
|
continue
|
||||||
serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
|
serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
|
||||||
|
serializers['vocab'] = lambda p: self.vocab.to_disk(p)
|
||||||
util.to_disk(path, serializers, {p: False for p in disable})
|
util.to_disk(path, serializers, {p: False for p in disable})
|
||||||
|
|
||||||
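Restating the serializer table above from the caller's side, to make the write order explicit; `nlp` and `path` are assumed, and moving 'vocab' last appears to be so that strings added while the components serialize are still captured:

    serializers = OrderedDict((
        ('tokenizer', lambda p: nlp.tokenizer.to_disk(p, vocab=False)),
        ('meta.json', lambda p: p.open('w').write(json_dumps(nlp.meta))),
    ))
    for proc in nlp.pipeline:                       # every pipe that knows how to serialize itself
        if hasattr(proc, 'to_disk') and hasattr(proc, 'name'):
            serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
    serializers['vocab'] = lambda p: nlp.vocab.to_disk(p)   # vocab written last
    util.to_disk(path, serializers, {})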
def from_disk(self, path, disable=tuple()):
|
def from_disk(self, path, disable=tuple()):
|
||||||
|
|
|
@ -38,7 +38,8 @@ class Lemmatizer(object):
|
||||||
avoid lemmatization entirely.
|
avoid lemmatization entirely.
|
||||||
"""
|
"""
|
||||||
morphology = {} if morphology is None else morphology
|
morphology = {} if morphology is None else morphology
|
||||||
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
|
others = [key for key in morphology
|
||||||
|
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
|
||||||
true_morph_key = morphology.get('morph', 0)
|
true_morph_key = morphology.get('morph', 0)
|
||||||
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
|
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
|
||||||
return True
|
return True
|
||||||
|
@ -47,7 +48,9 @@ class Lemmatizer(object):
|
||||||
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
||||||
# morphology
|
# morphology
|
||||||
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
|
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
|
||||||
morphology.get('Tense') == 'pres'):
|
morphology.get('Tense') == 'pres' and \
|
||||||
|
morphology.get('Number') is None and \
|
||||||
|
not others):
|
||||||
return True
|
return True
|
||||||
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
|
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
|
||||||
return True
|
return True
|
||||||
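A worked example of the tightened base-form check for present-tense verbs; the tokens and tag values are illustrative:

    # 'sleeps' (VBZ): VerbForm=fin, Tense=pres, Number=sing -> Number blocks the rule, so lemmatize normally
    # 'sleep'  (VBP): VerbForm=fin, Tense=pres, no Number and nothing left in `others`
    #                 -> counted as a base form, and the surrounding method returns True to skip lemmatization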
|
|
|
@ -421,47 +421,69 @@ cdef class PhraseMatcher:
|
||||||
cdef int max_length
|
cdef int max_length
|
||||||
cdef attr_t* _phrase_key
|
cdef attr_t* _phrase_key
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, phrases, max_length=10):
|
cdef public object _callbacks
|
||||||
|
cdef public object _patterns
|
||||||
|
|
||||||
|
def __init__(self, Vocab vocab, max_length=10):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
|
self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
|
||||||
self.max_length = max_length
|
self.max_length = max_length
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.matcher = Matcher(self.vocab, {})
|
self.matcher = Matcher(self.vocab)
|
||||||
self.phrase_ids = PreshMap()
|
self.phrase_ids = PreshMap()
|
||||||
for phrase in phrases:
|
|
||||||
if len(phrase) < max_length:
|
|
||||||
self.add(phrase)
|
|
||||||
|
|
||||||
abstract_patterns = []
|
abstract_patterns = []
|
||||||
for length in range(1, max_length):
|
for length in range(1, max_length):
|
||||||
abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
|
abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
|
||||||
self.matcher.add('Candidate', 'MWE', {}, abstract_patterns, acceptor=self.accept_match)
|
self.matcher.add('Candidate', None, *abstract_patterns)
|
||||||
|
self._callbacks = {}
|
||||||
|
|
||||||
def add(self, Doc tokens):
|
def __len__(self):
|
||||||
cdef int length = tokens.length
|
raise NotImplementedError
|
||||||
assert length < self.max_length
|
|
||||||
tags = get_bilou(length)
|
|
||||||
assert len(tags) == length, length
|
|
||||||
|
|
||||||
|
def __contains__(self, key):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __reduce__(self):
|
||||||
|
return (self.__class__, (self.vocab,), None, None)
|
||||||
|
|
||||||
|
def add(self, key, on_match, *docs):
|
||||||
|
cdef Doc doc
|
||||||
|
for doc in docs:
|
||||||
|
if len(doc) >= self.max_length:
|
||||||
|
msg = (
|
||||||
|
"Pattern length (%d) >= phrase_matcher.max_length (%d). "
|
||||||
|
"Length can be set on initialization, up to 10."
|
||||||
|
)
|
||||||
|
raise ValueError(msg % (len(doc), self.max_length))
|
||||||
|
cdef hash_t ent_id = self.matcher._normalize_key(key)
|
||||||
|
self._callbacks[ent_id] = on_match
|
||||||
|
|
||||||
|
cdef int length
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(self.max_length):
|
cdef hash_t phrase_hash
|
||||||
self._phrase_key[i] = 0
|
for doc in docs:
|
||||||
for i, tag in enumerate(tags):
|
length = doc.length
|
||||||
lexeme = self.vocab[tokens.c[i].lex.orth]
|
tags = get_bilou(length)
|
||||||
lexeme.set_flag(tag, True)
|
for i in range(self.max_length):
|
||||||
self._phrase_key[i] = lexeme.orth
|
self._phrase_key[i] = 0
|
||||||
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
|
for i, tag in enumerate(tags):
|
||||||
self.phrase_ids[key] = True
|
lexeme = self.vocab[doc.c[i].lex.orth]
|
||||||
|
lexeme.set_flag(tag, True)
|
||||||
|
self._phrase_key[i] = lexeme.orth
|
||||||
|
phrase_hash = hash64(self._phrase_key,
|
||||||
|
self.max_length * sizeof(attr_t), 0)
|
||||||
|
self.phrase_ids.set(phrase_hash, <void*>ent_id)
|
||||||
|
|
||||||
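A usage sketch of the reworked add(key, on_match, *docs) API; the `nlp` object and the pattern texts are illustrative:

    from spacy.matcher import PhraseMatcher

    matcher = PhraseMatcher(nlp.vocab, max_length=10)
    patterns = [nlp(text) for text in ('Barack Obama', 'New York City')]
    matcher.add('ENTITY', None, *patterns)             # key, optional on_match callback, one Doc per phrase
    doc = nlp('I saw Barack Obama in New York City.')
    for ent_id, start, end in matcher(doc):            # (key hash, start token index, end token index)
        print(nlp.vocab.strings[ent_id], doc[start:end].text)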
def __call__(self, Doc doc):
|
def __call__(self, Doc doc):
|
||||||
matches = []
|
matches = []
|
||||||
for ent_id, label, start, end in self.matcher(doc):
|
for _, start, end in self.matcher(doc):
|
||||||
cand = doc[start : end]
|
ent_id = self.accept_match(doc, start, end)
|
||||||
start = cand[0].idx
|
if ent_id is not None:
|
||||||
end = cand[-1].idx + len(cand[-1])
|
matches.append((ent_id, start, end))
|
||||||
matches.append((start, end, cand.root.tag_, cand.text, 'MWE'))
|
for i, (ent_id, start, end) in enumerate(matches):
|
||||||
for match in matches:
|
on_match = self._callbacks.get(ent_id)
|
||||||
doc.merge(*match)
|
if on_match is not None:
|
||||||
|
on_match(self, doc, i, matches)
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=1000, n_threads=2):
|
def pipe(self, stream, batch_size=1000, n_threads=2):
|
||||||
|
@ -469,7 +491,7 @@ cdef class PhraseMatcher:
|
||||||
self(doc)
|
self(doc)
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end):
|
def accept_match(self, Doc doc, int start, int end):
|
||||||
assert (end - start) < self.max_length
|
assert (end - start) < self.max_length
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
for i in range(self.max_length):
|
for i in range(self.max_length):
|
||||||
|
@ -477,7 +499,8 @@ cdef class PhraseMatcher:
|
||||||
for i, j in enumerate(range(start, end)):
|
for i, j in enumerate(range(start, end)):
|
||||||
self._phrase_key[i] = doc.c[j].lex.orth
|
self._phrase_key[i] = doc.c[j].lex.orth
|
||||||
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
|
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
|
||||||
if self.phrase_ids.get(key):
|
ent_id = <hash_t>self.phrase_ids.get(key)
|
||||||
return (ent_id, label, start, end)
|
if ent_id == 0:
|
||||||
|
return None
|
||||||
else:
|
else:
|
||||||
return False
|
return ent_id
|
||||||
|
|
|
@ -146,6 +146,8 @@ cdef class Morphology:
|
||||||
self.add_special_case(tag_str, form_str, attrs)
|
self.add_special_case(tag_str, form_str, attrs)
|
||||||
|
|
||||||
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
||||||
|
if orth not in self.strings:
|
||||||
|
return orth
|
||||||
cdef unicode py_string = self.strings[orth]
|
cdef unicode py_string = self.strings[orth]
|
||||||
if self.lemmatizer is None:
|
if self.lemmatizer is None:
|
||||||
return self.strings.add(py_string.lower())
|
return self.strings.add(py_string.lower())
|
||||||
|
|
|
@ -4,7 +4,6 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from thinc.api import chain, layerize, with_getitem
|
from thinc.api import chain, layerize, with_getitem
|
||||||
from thinc.neural import Model, Softmax
|
|
||||||
import numpy
|
import numpy
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
import cytoolz
|
import cytoolz
|
||||||
|
@ -14,17 +13,18 @@ import ujson
|
||||||
import msgpack
|
import msgpack
|
||||||
|
|
||||||
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
||||||
from thinc.neural import Model, Maxout, Softmax, Affine
|
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
|
||||||
from thinc.neural._classes.hash_embed import HashEmbed
|
from thinc.i2v import HashEmbed
|
||||||
|
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
|
||||||
|
from thinc.t2t import ExtractWindow, ParametricAttention
|
||||||
|
from thinc.misc import Residual
|
||||||
|
from thinc.misc import BatchNorm as BN
|
||||||
|
from thinc.misc import LayerNorm as LN
|
||||||
|
|
||||||
from thinc.neural.util import to_categorical
|
from thinc.neural.util import to_categorical
|
||||||
|
|
||||||
from thinc.neural.pooling import Pooling, max_pool, mean_pool
|
|
||||||
from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
||||||
|
|
||||||
from thinc.neural._classes.convolution import ExtractWindow
|
|
||||||
from thinc.neural._classes.resnet import Residual
|
|
||||||
from thinc.neural._classes.batchnorm import BatchNorm as BN
|
|
||||||
|
|
||||||
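The import churn above tracks thinc's reorganisation into shape-named submodules; a rough mapping as read from this hunk, not an exhaustive or authoritative list:

    # thinc.neural (Maxout, Softmax, Affine, ...)           -> thinc.v2v
    # thinc.neural._classes.hash_embed (HashEmbed)          -> thinc.i2v
    # thinc.neural.pooling (Pooling, max_pool, mean_pool)   -> thinc.t2v
    # thinc.neural._classes.convolution (ExtractWindow)     -> thinc.t2t (plus ParametricAttention)
    # thinc.neural._classes.resnet / batchnorm / layernorm  -> thinc.misc (Residual, BN, LN)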
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
from .syntax.parser cimport Parser as LinearParser
|
from .syntax.parser cimport Parser as LinearParser
|
||||||
from .syntax.nn_parser cimport Parser as NeuralParser
|
from .syntax.nn_parser cimport Parser as NeuralParser
|
||||||
|
@ -41,13 +41,14 @@ from .syntax import nonproj
|
||||||
from .compat import json_dumps
|
from .compat import json_dumps
|
||||||
|
|
||||||
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
|
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
|
||||||
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
|
from ._ml import rebatch, Tok2Vec, flatten
|
||||||
from ._ml import build_text_classifier, build_tagger_model
|
from ._ml import build_text_classifier, build_tagger_model
|
||||||
|
from ._ml import link_vectors_to_models
|
||||||
from .parts_of_speech import X
|
from .parts_of_speech import X
|
||||||
|
|
||||||
|
|
||||||
class SentenceSegmenter(object):
|
class SentenceSegmenter(object):
|
||||||
'''A simple spaCy hook, to allow custom sentence boundary detection logic
|
"""A simple spaCy hook, to allow custom sentence boundary detection logic
|
||||||
(that doesn't require the dependency parse).
|
(that doesn't require the dependency parse).
|
||||||
|
|
||||||
To change the sentence boundary detection strategy, pass a generator
|
To change the sentence boundary detection strategy, pass a generator
|
||||||
|
@ -56,7 +57,7 @@ class SentenceSegmenter(object):
|
||||||
|
|
||||||
Sentence detection strategies should be generators that take `Doc` objects
|
Sentence detection strategies should be generators that take `Doc` objects
|
||||||
and yield `Span` objects for each sentence.
|
and yield `Span` objects for each sentence.
|
||||||
'''
|
"""
|
||||||
name = 'sbd'
|
name = 'sbd'
|
||||||
|
|
||||||
def __init__(self, vocab, strategy=None):
|
def __init__(self, vocab, strategy=None):
|
||||||
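A minimal sketch of a custom strategy for the hook described above, splitting on newline tokens; the function and the `nlp` object are assumptions:

    def split_on_newlines(doc):
        start = 0
        for token in doc:
            if token.text == '\n':
                yield doc[start:token.i + 1]
                start = token.i + 1
        if start < len(doc):
            yield doc[start:]

    sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
    nlp.pipeline.append(sbd)       # the pipeline is a plain list of callables elsewhere in this file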
|
@ -88,17 +89,30 @@ class BaseThincComponent(object):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, *shape, **kwargs):
|
def Model(cls, *shape, **kwargs):
|
||||||
|
"""Initialize a model for the pipe."""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
|
"""Create a new pipe instance."""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
|
"""Apply the pipe to one document. The document is
|
||||||
|
modified in-place, and returned.
|
||||||
|
|
||||||
|
Both __call__ and pipe should delegate to the `predict()`
|
||||||
|
and `set_annotations()` methods.
|
||||||
|
"""
|
||||||
scores = self.predict([doc])
|
scores = self.predict([doc])
|
||||||
self.set_annotations([doc], scores)
|
self.set_annotations([doc], scores)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||||
|
"""Apply the pipe to a stream of documents.
|
||||||
|
|
||||||
|
Both __call__ and pipe should delegate to the `predict()`
|
||||||
|
and `set_annotations()` methods.
|
||||||
|
"""
|
||||||
for docs in cytoolz.partition_all(batch_size, stream):
|
for docs in cytoolz.partition_all(batch_size, stream):
|
||||||
docs = list(docs)
|
docs = list(docs)
|
||||||
scores = self.predict(docs)
|
scores = self.predict(docs)
|
||||||
|
@ -106,27 +120,43 @@ class BaseThincComponent(object):
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
|
"""Apply the pipeline's model to a batch of docs, without
|
||||||
|
modifying them.
|
||||||
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def set_annotations(self, docs, scores):
|
def set_annotations(self, docs, scores):
|
||||||
|
"""Modify a batch of documents, using pre-computed scores."""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
|
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
"""Learn from a batch of documents and gold-standard information,
|
||||||
|
updating the pipe's model.
|
||||||
|
|
||||||
|
Delegates to predict() and get_loss().
|
||||||
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
|
"""Find the loss and gradient of loss for the batch of
|
||||||
|
documents and their predicted scores."""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
||||||
token_vector_width = pipeline[0].model.nO
|
"""Initialize the pipe for training, using data exampes if available.
|
||||||
|
If no model has been initialized yet, the model is added."""
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(1, token_vector_width)
|
self.model = self.Model(**self.cfg)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
def use_params(self, params):
|
def use_params(self, params):
|
||||||
|
"""Modify the pipe's model, to use the given parameter values.
|
||||||
|
"""
|
||||||
with self.model.use_params(params):
|
with self.model.use_params(params):
|
||||||
yield
|
yield
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
|
"""Serialize the pipe to a bytestring."""
|
||||||
serialize = OrderedDict((
|
serialize = OrderedDict((
|
||||||
('cfg', lambda: json_dumps(self.cfg)),
|
('cfg', lambda: json_dumps(self.cfg)),
|
||||||
('model', lambda: self.model.to_bytes()),
|
('model', lambda: self.model.to_bytes()),
|
||||||
|
@ -135,37 +165,42 @@ class BaseThincComponent(object):
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
|
"""Load the pipe from a bytestring."""
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
|
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
self.model = self.Model(**self.cfg)
|
self.model = self.Model(**self.cfg)
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
|
||||||
deserialize = OrderedDict((
|
deserialize = OrderedDict((
|
||||||
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
||||||
|
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||||
('model', load_model),
|
('model', load_model),
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b))
|
|
||||||
))
|
))
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
util.from_bytes(bytes_data, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, **exclude):
|
||||||
|
"""Serialize the pipe to disk."""
|
||||||
serialize = OrderedDict((
|
serialize = OrderedDict((
|
||||||
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
|
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
|
||||||
|
('vocab', lambda p: self.vocab.to_disk(p)),
|
||||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
||||||
('vocab', lambda p: self.vocab.to_disk(p))
|
|
||||||
))
|
))
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, **exclude):
|
||||||
|
"""Load the pipe from disk."""
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
|
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
self.model = self.Model(**self.cfg)
|
self.model = self.Model(**self.cfg)
|
||||||
self.model.from_bytes(p.open('rb').read())
|
self.model.from_bytes(p.open('rb').read())
|
||||||
|
|
||||||
deserialize = OrderedDict((
|
deserialize = OrderedDict((
|
||||||
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
|
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
|
||||||
('model', load_model),
|
|
||||||
('vocab', lambda p: self.vocab.from_disk(p)),
|
('vocab', lambda p: self.vocab.from_disk(p)),
|
||||||
|
('model', load_model),
|
||||||
))
|
))
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
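The deserializers above were reordered so the vocab is restored before the model, which matters now that load_model reads pretrained_dims off the vocab before building it. A condensed round-trip sketch, with `pipe` and `vocab` standing in for an existing component and its Vocab:

    data = pipe.to_bytes()            # cfg first, then the model weights
    fresh = pipe.__class__(vocab)     # model=True, i.e. weights not allocated yet
    fresh.from_bytes(data)            # cfg -> vocab -> model, so Model(**cfg) sees the loaded vectors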
|
@ -193,7 +228,7 @@ class TokenVectorEncoder(BaseThincComponent):
|
||||||
"""
|
"""
|
||||||
width = util.env_opt('token_vector_width', width)
|
width = util.env_opt('token_vector_width', width)
|
||||||
embed_size = util.env_opt('embed_size', embed_size)
|
embed_size = util.env_opt('embed_size', embed_size)
|
||||||
return Tok2Vec(width, embed_size, preprocess=None)
|
return Tok2Vec(width, embed_size, **cfg)
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
"""Construct a new statistical model. Weights are not allocated on
|
"""Construct a new statistical model. Weights are not allocated on
|
||||||
|
@ -210,9 +245,10 @@ class TokenVectorEncoder(BaseThincComponent):
|
||||||
>>> tok2vec.model = tok2vec.Model(128, 5000)
|
>>> tok2vec.model = tok2vec.Model(128, 5000)
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.doc2feats = doc2feats()
|
|
||||||
self.model = model
|
self.model = model
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
|
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
||||||
|
self.cfg.setdefault('cnn_maxout_pieces', 3)
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
||||||
|
@ -245,8 +281,7 @@ class TokenVectorEncoder(BaseThincComponent):
|
||||||
docs (iterable): A sequence of `Doc` objects.
|
docs (iterable): A sequence of `Doc` objects.
|
||||||
RETURNS (object): Vector representations for each token in the documents.
|
RETURNS (object): Vector representations for each token in the documents.
|
||||||
"""
|
"""
|
||||||
feats = self.doc2feats(docs)
|
tokvecs = self.model(docs)
|
||||||
tokvecs = self.model(feats)
|
|
||||||
return tokvecs
|
return tokvecs
|
||||||
|
|
||||||
def set_annotations(self, docs, tokvecses):
|
def set_annotations(self, docs, tokvecses):
|
||||||
|
@ -270,8 +305,7 @@ class TokenVectorEncoder(BaseThincComponent):
|
||||||
"""
|
"""
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
feats = self.doc2feats(docs)
|
tokvecs, bp_tokvecs = self.model.begin_update(docs, drop=drop)
|
||||||
tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop)
|
|
||||||
return tokvecs, bp_tokvecs
|
return tokvecs, bp_tokvecs
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
|
@ -285,9 +319,10 @@ class TokenVectorEncoder(BaseThincComponent):
|
||||||
gold_tuples (iterable): Gold-standard training data.
|
gold_tuples (iterable): Gold-standard training data.
|
||||||
pipeline (list): The pipeline the model is part of.
|
pipeline (list): The pipeline the model is part of.
|
||||||
"""
|
"""
|
||||||
self.doc2feats = doc2feats()
|
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model()
|
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
|
self.model = self.Model(**self.cfg)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
|
|
||||||
class NeuralTagger(BaseThincComponent):
|
class NeuralTagger(BaseThincComponent):
|
||||||
|
@ -296,29 +331,29 @@ class NeuralTagger(BaseThincComponent):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
|
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
||||||
|
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
tags = self.predict(([doc], [doc.tensor]))
|
tags = self.predict([doc])
|
||||||
self.set_annotations([doc], tags)
|
self.set_annotations([doc], tags)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||||
for docs in cytoolz.partition_all(batch_size, stream):
|
for docs in cytoolz.partition_all(batch_size, stream):
|
||||||
docs = list(docs)
|
docs = list(docs)
|
||||||
tokvecs = [d.tensor for d in docs]
|
tag_ids = self.predict(docs)
|
||||||
tag_ids = self.predict((docs, tokvecs))
|
|
||||||
self.set_annotations(docs, tag_ids)
|
self.set_annotations(docs, tag_ids)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs_tokvecs):
|
def predict(self, docs):
|
||||||
scores = self.model(docs_tokvecs)
|
scores = self.model(docs)
|
||||||
scores = self.model.ops.flatten(scores)
|
scores = self.model.ops.flatten(scores)
|
||||||
guesses = scores.argmax(axis=1)
|
guesses = scores.argmax(axis=1)
|
||||||
if not isinstance(guesses, numpy.ndarray):
|
if not isinstance(guesses, numpy.ndarray):
|
||||||
guesses = guesses.get()
|
guesses = guesses.get()
|
||||||
tokvecs = docs_tokvecs[1]
|
|
||||||
guesses = self.model.ops.unflatten(guesses,
|
guesses = self.model.ops.unflatten(guesses,
|
||||||
[tv.shape[0] for tv in tokvecs])
|
[len(d) for d in docs])
|
||||||
return guesses
|
return guesses
|
||||||
|
|
||||||
def set_annotations(self, docs, batch_tag_ids):
|
def set_annotations(self, docs, batch_tag_ids):
|
||||||
|
@ -338,20 +373,16 @@ class NeuralTagger(BaseThincComponent):
|
||||||
idx += 1
|
idx += 1
|
||||||
doc.is_tagged = True
|
doc.is_tagged = True
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
docs, tokvecs = docs_tokvecs
|
|
||||||
|
|
||||||
if self.model.nI is None:
|
tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
|
||||||
self.model.nI = tokvecs[0].shape[1]
|
|
||||||
tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
|
|
||||||
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
||||||
|
bp_tag_scores(d_tag_scores, sgd=sgd)
|
||||||
|
|
||||||
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses[self.name] += loss
|
losses[self.name] += loss
|
||||||
return d_tokvecs
|
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
scores = self.model.ops.flatten(scores)
|
scores = self.model.ops.flatten(scores)
|
||||||
|
@ -392,14 +423,15 @@ class NeuralTagger(BaseThincComponent):
|
||||||
vocab.morphology = Morphology(vocab.strings, new_tag_map,
|
vocab.morphology = Morphology(vocab.strings, new_tag_map,
|
||||||
vocab.morphology.lemmatizer,
|
vocab.morphology.lemmatizer,
|
||||||
exc=vocab.morphology.exc)
|
exc=vocab.morphology.exc)
|
||||||
token_vector_width = pipeline[0].model.nO
|
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
||||||
|
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, n_tags, token_vector_width):
|
def Model(cls, n_tags, **cfg):
|
||||||
return build_tagger_model(n_tags, token_vector_width)
|
return build_tagger_model(n_tags, **cfg)
|
||||||
|
|
||||||
def use_params(self, params):
|
def use_params(self, params):
|
||||||
with self.model.use_params(params):
|
with self.model.use_params(params):
|
||||||
yield
|
yield
|
||||||
|
@ -419,7 +451,7 @@ class NeuralTagger(BaseThincComponent):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
token_vector_width = util.env_opt('token_vector_width',
|
token_vector_width = util.env_opt('token_vector_width',
|
||||||
self.cfg.get('token_vector_width', 128))
|
self.cfg.get('token_vector_width', 128))
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
|
||||||
def load_tag_map(b):
|
def load_tag_map(b):
|
||||||
|
@ -428,7 +460,7 @@ class NeuralTagger(BaseThincComponent):
|
||||||
self.vocab.strings, tag_map=tag_map,
|
self.vocab.strings, tag_map=tag_map,
|
||||||
lemmatizer=self.vocab.morphology.lemmatizer,
|
lemmatizer=self.vocab.morphology.lemmatizer,
|
||||||
exc=self.vocab.morphology.exc)
|
exc=self.vocab.morphology.exc)
|
||||||
|
|
||||||
deserialize = OrderedDict((
|
deserialize = OrderedDict((
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||||
('tag_map', load_tag_map),
|
('tag_map', load_tag_map),
|
||||||
|
@ -438,6 +470,7 @@ class NeuralTagger(BaseThincComponent):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, **exclude):
|
||||||
|
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
||||||
serialize = OrderedDict((
|
serialize = OrderedDict((
|
||||||
('vocab', lambda p: self.vocab.to_disk(p)),
|
('vocab', lambda p: self.vocab.to_disk(p)),
|
||||||
('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
|
('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
|
||||||
|
@ -452,9 +485,7 @@ class NeuralTagger(BaseThincComponent):
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, **exclude):
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
token_vector_width = util.env_opt('token_vector_width',
|
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
||||||
self.cfg.get('token_vector_width', 128))
|
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
|
||||||
self.model.from_bytes(p.open('rb').read())
|
self.model.from_bytes(p.open('rb').read())
|
||||||
|
|
||||||
def load_tag_map(p):
|
def load_tag_map(p):
|
||||||
|
@ -466,10 +497,10 @@ class NeuralTagger(BaseThincComponent):
|
||||||
exc=self.vocab.morphology.exc)
|
exc=self.vocab.morphology.exc)
|
||||||
|
|
||||||
deserialize = OrderedDict((
|
deserialize = OrderedDict((
|
||||||
|
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
|
||||||
('vocab', lambda p: self.vocab.from_disk(p)),
|
('vocab', lambda p: self.vocab.from_disk(p)),
|
||||||
('tag_map', load_tag_map),
|
('tag_map', load_tag_map),
|
||||||
('model', load_model),
|
('model', load_model),
|
||||||
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
|
|
||||||
))
|
))
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
@ -477,10 +508,28 @@ class NeuralTagger(BaseThincComponent):
|
||||||
|
|
||||||
class NeuralLabeller(NeuralTagger):
|
class NeuralLabeller(NeuralTagger):
|
||||||
name = 'nn_labeller'
|
name = 'nn_labeller'
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
if target == 'dep':
|
||||||
|
self.make_label = self.make_dep
|
||||||
|
elif target == 'tag':
|
||||||
|
self.make_label = self.make_tag
|
||||||
|
elif target == 'ent':
|
||||||
|
self.make_label = self.make_ent
|
||||||
|
elif target == 'dep_tag_offset':
|
||||||
|
self.make_label = self.make_dep_tag_offset
|
||||||
|
elif target == 'ent_tag':
|
||||||
|
self.make_label = self.make_ent_tag
|
||||||
|
elif hasattr(target, '__call__'):
|
||||||
|
self.make_label = target
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
"NeuralLabeller target should be function or one of "
|
||||||
|
"['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
|
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
||||||
|
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
|
||||||
|
|
||||||
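A usage sketch of the target dispatch above; the `vocab` variable and the custom function are assumptions:

    labeller = NeuralLabeller(vocab, target='dep')        # predict dependency labels as the auxiliary task
    labeller = NeuralLabeller(vocab, target='ent_tag')    # or joint tag/entity labels

    def suffix3(i, words, tags, heads, deps, ents):       # any callable with this signature also works
        return words[i][-3:]
    labeller = NeuralLabeller(vocab, target=suffix3)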
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
@ -493,41 +542,79 @@ class NeuralLabeller(NeuralTagger):
|
||||||
def set_annotations(self, docs, dep_ids):
|
def set_annotations(self, docs, dep_ids):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):
|
||||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
||||||
for raw_text, annots_brackets in gold_tuples:
|
for raw_text, annots_brackets in gold_tuples:
|
||||||
for annots, brackets in annots_brackets:
|
for annots, brackets in annots_brackets:
|
||||||
ids, words, tags, heads, deps, ents = annots
|
ids, words, tags, heads, deps, ents = annots
|
||||||
for dep in deps:
|
for i in range(len(ids)):
|
||||||
if dep not in self.labels:
|
label = self.make_label(i, words, tags, heads, deps, ents)
|
||||||
self.labels[dep] = len(self.labels)
|
if label is not None and label not in self.labels:
|
||||||
token_vector_width = pipeline[0].model.nO
|
self.labels[label] = len(self.labels)
|
||||||
|
print(len(self.labels))
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(len(self.labels), token_vector_width)
|
token_vector_width = util.env_opt('token_vector_width')
|
||||||
|
self.model = chain(
|
||||||
|
tok2vec,
|
||||||
|
Softmax(len(self.labels), token_vector_width)
|
||||||
|
)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, n_tags, token_vector_width):
|
def Model(cls, n_tags, tok2vec=None, **cfg):
|
||||||
return build_tagger_model(n_tags, token_vector_width)
|
return build_tagger_model(n_tags, tok2vec=tok2vec, **cfg)
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
scores = self.model.ops.flatten(scores)
|
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
correct = numpy.zeros((scores.shape[0],), dtype='i')
|
correct = numpy.zeros((scores.shape[0],), dtype='i')
|
||||||
guesses = scores.argmax(axis=1)
|
guesses = scores.argmax(axis=1)
|
||||||
for gold in golds:
|
for gold in golds:
|
||||||
for tag in gold.labels:
|
for i in range(len(gold.labels)):
|
||||||
if tag is None or tag not in self.labels:
|
label = self.make_label(i, gold.words, gold.tags, gold.heads,
|
||||||
|
gold.labels, gold.ents)
|
||||||
|
if label is None or label not in self.labels:
|
||||||
correct[idx] = guesses[idx]
|
correct[idx] = guesses[idx]
|
||||||
else:
|
else:
|
||||||
correct[idx] = self.labels[tag]
|
correct[idx] = self.labels[label]
|
||||||
idx += 1
|
idx += 1
|
||||||
correct = self.model.ops.xp.array(correct, dtype='i')
|
correct = self.model.ops.xp.array(correct, dtype='i')
|
||||||
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
||||||
d_scores /= d_scores.shape[0]
|
d_scores /= d_scores.shape[0]
|
||||||
loss = (d_scores**2).sum()
|
loss = (d_scores**2).sum()
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
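A worked example of the gradient computed above, for a single row with three classes; the numbers are illustrative:

    # scores row    = [0.2, 0.7, 0.1]
    # correct class = 0, so the one-hot row is [1.0, 0.0, 0.0]
    # d_scores row  = scores - one_hot = [-0.8, 0.7, 0.1], then divided by the number of rows
    # loss          = sum of the squared d_scores entries over the whole batch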
|
@staticmethod
|
||||||
|
def make_dep(i, words, tags, heads, deps, ents):
|
||||||
|
if deps[i] is None or heads[i] is None:
|
||||||
|
return None
|
||||||
|
return deps[i]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def make_tag(i, words, tags, heads, deps, ents):
|
||||||
|
return tags[i]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def make_ent(i, words, tags, heads, deps, ents):
|
||||||
|
if ents is None:
|
||||||
|
return None
|
||||||
|
return ents[i]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def make_dep_tag_offset(i, words, tags, heads, deps, ents):
|
||||||
|
if deps[i] is None or heads[i] is None:
|
||||||
|
return None
|
||||||
|
offset = heads[i] - i
|
||||||
|
offset = min(offset, 2)
|
||||||
|
offset = max(offset, -2)
|
||||||
|
return '%s-%s:%d' % (deps[i], tags[i], offset)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def make_ent_tag(i, words, tags, heads, deps, ents):
|
||||||
|
if ents is None or ents[i] is None:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
return '%s-%s' % (tags[i], ents[i])
|
||||||
|
|
||||||
|
|
||||||
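To make the label formats above concrete, a small worked example with illustrative tokens and annotations:

    words = ['She', 'sleeps']
    tags  = ['PRP', 'VBZ']
    heads = [1, 1]
    deps  = ['nsubj', 'ROOT']
    ents  = ['O', 'O']
    # make_dep_tag_offset(0, words, tags, heads, deps, ents) -> 'nsubj-PRP:1'  (head offset clipped to -2..2)
    # make_ent_tag(0, words, tags, heads, deps, ents)        -> 'PRP-O'        (tag joined with the entity annotation)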
class SimilarityHook(BaseThincComponent):
|
class SimilarityHook(BaseThincComponent):
|
||||||
"""
|
"""
|
||||||
|
@ -555,7 +642,7 @@ class SimilarityHook(BaseThincComponent):
|
||||||
return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
|
return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
'''Install similarity hook'''
|
"""Install similarity hook"""
|
||||||
doc.user_hooks['similarity'] = self.predict
|
doc.user_hooks['similarity'] = self.predict
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
@ -564,15 +651,10 @@ class SimilarityHook(BaseThincComponent):
|
||||||
yield self(doc)
|
yield self(doc)
|
||||||
|
|
||||||
def predict(self, doc1, doc2):
|
def predict(self, doc1, doc2):
|
||||||
return self.model.predict([(doc1.tensor, doc2.tensor)])
|
return self.model.predict([(doc1, doc2)])
|
||||||
|
|
||||||
def update(self, doc1_tensor1_doc2_tensor2, golds, sgd=None, drop=0.):
|
def update(self, doc1_doc2, golds, sgd=None, drop=0.):
|
||||||
doc1s, tensor1s, doc2s, tensor2s = doc1_tensor1_doc2_tensor2
|
sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
|
||||||
sims, bp_sims = self.model.begin_update(zip(tensor1s, tensor2s),
|
|
||||||
drop=drop)
|
|
||||||
d_tensor1s, d_tensor2s = bp_sims(golds, sgd=sgd)
|
|
||||||
|
|
||||||
return d_tensor1s, d_tensor2s
|
|
||||||
|
|
||||||
def begin_training(self, _=tuple(), pipeline=None):
|
def begin_training(self, _=tuple(), pipeline=None):
|
||||||
"""
|
"""
|
||||||
|
@ -583,6 +665,7 @@ class SimilarityHook(BaseThincComponent):
|
||||||
"""
|
"""
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(pipeline[0].model.nO)
|
self.model = self.Model(pipeline[0].model.nO)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
|
|
||||||
class TextCategorizer(BaseThincComponent):
|
class TextCategorizer(BaseThincComponent):
|
||||||
|
@ -627,15 +710,13 @@ class TextCategorizer(BaseThincComponent):
|
||||||
for j, label in enumerate(self.labels):
|
for j, label in enumerate(self.labels):
|
||||||
doc.cats[label] = float(scores[i, j])
|
doc.cats[label] = float(scores[i, j])
|
||||||
|
|
||||||
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
|
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
|
||||||
docs, tensors = docs_tensors
|
|
||||||
scores, bp_scores = self.model.begin_update(docs, drop=drop)
|
scores, bp_scores = self.model.begin_update(docs, drop=drop)
|
||||||
loss, d_scores = self.get_loss(docs, golds, scores)
|
loss, d_scores = self.get_loss(docs, golds, scores)
|
||||||
d_tensors = bp_scores(d_scores, sgd=sgd)
|
bp_scores(d_scores, sgd=sgd)
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
losses[self.name] += loss
|
losses[self.name] += loss
|
||||||
return d_tensors
|
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
|
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
|
||||||
|
@ -653,8 +734,10 @@ class TextCategorizer(BaseThincComponent):
|
||||||
else:
|
else:
|
||||||
token_vector_width = 64
|
token_vector_width = 64
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
|
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
self.model = self.Model(len(self.labels), token_vector_width,
|
self.model = self.Model(len(self.labels), token_vector_width,
|
||||||
**self.cfg)
|
**self.cfg)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
|
|
||||||
cdef class EntityRecognizer(LinearParser):
|
cdef class EntityRecognizer(LinearParser):
|
||||||
|
@ -695,6 +778,14 @@ cdef class NeuralDependencyParser(NeuralParser):
|
||||||
name = 'parser'
|
name = 'parser'
|
||||||
TransitionSystem = ArcEager
|
TransitionSystem = ArcEager
|
||||||
|
|
||||||
|
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
|
||||||
|
for target in []:
|
||||||
|
labeller = NeuralLabeller(self.vocab, target=target)
|
||||||
|
tok2vec = self.model[0]
|
||||||
|
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
|
||||||
|
pipeline.append(labeller)
|
||||||
|
self._multitasks.append(labeller)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
|
return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
|
||||||
|
@ -705,13 +796,13 @@ cdef class NeuralEntityRecognizer(NeuralParser):
|
||||||
|
|
||||||
nr_feature = 6
|
nr_feature = 6
|
||||||
|
|
||||||
def predict_confidences(self, docs):
|
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
|
||||||
tensors = [d.tensor for d in docs]
|
for target in []:
|
||||||
samples = []
|
labeller = NeuralLabeller(self.vocab, target=target)
|
||||||
for i in range(10):
|
tok2vec = self.model[0]
|
||||||
states = self.parse_batch(docs, tensors, drop=0.3)
|
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
|
||||||
for state in states:
|
pipeline.append(labeller)
|
||||||
samples.append(self._get_entities(state))
|
self._multitasks.append(labeller)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
|
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
cpdef enum symbol_t:
|
cdef enum symbol_t:
|
||||||
NIL
|
NIL
|
||||||
IS_ALPHA
|
IS_ALPHA
|
||||||
IS_ASCII
|
IS_ASCII
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
|
#cython: optimize.unpack_method_calls=False
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
IDS = {
|
IDS = {
|
||||||
|
@ -458,4 +460,11 @@ IDS = {
|
||||||
"xcomp": xcomp
|
"xcomp": xcomp
|
||||||
}
|
}
|
||||||
|
|
||||||
NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])]
|
def sort_nums(x):
|
||||||
|
return x[1]
|
||||||
|
|
||||||
|
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
|
||||||
|
# Unfortunate hack here, to work around problem with long cpdef enum
|
||||||
|
# (which is generating an enormous amount of C++ in Cython 0.24+)
|
||||||
|
# We keep the enum cdef, and just make sure the names are available to Python
|
||||||
|
locals().update(IDS)
|
||||||
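A quick sanity-check sketch of what the module exposes after this change, assuming spacy.symbols keeps these module-level names:

    from spacy import symbols
    assert symbols.NAMES[symbols.IDS['xcomp']] == 'xcomp'   # NAMES is ordered by the numeric ID
    assert symbols.xcomp == symbols.IDS['xcomp']            # names injected via locals().update(IDS)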
|
|
|
@ -147,10 +147,10 @@ def get_token_ids(states, int n_tokens):
|
||||||
|
|
||||||
nr_update = 0
|
nr_update = 0
|
||||||
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
||||||
states, tokvecs, golds,
|
states, golds,
|
||||||
state2vec, vec2scores,
|
state2vec, vec2scores,
|
||||||
int width, float density,
|
int width, float density,
|
||||||
sgd=None, losses=None, drop=0.):
|
losses=None, drop=0.):
|
||||||
global nr_update
|
global nr_update
|
||||||
cdef MaxViolation violn
|
cdef MaxViolation violn
|
||||||
nr_update += 1
|
nr_update += 1
|
||||||
|
|
|
@ -101,9 +101,10 @@ cdef cppclass StateC:
|
||||||
elif n == 6:
|
elif n == 6:
|
||||||
if this.B(0) >= 0:
|
if this.B(0) >= 0:
|
||||||
ids[0] = this.B(0)
|
ids[0] = this.B(0)
|
||||||
|
ids[1] = this.B(0)-1
|
||||||
else:
|
else:
|
||||||
ids[0] = -1
|
ids[0] = -1
|
||||||
ids[1] = this.B(0)
|
ids[1] = -1
|
||||||
ids[2] = this.B(1)
|
ids[2] = this.B(1)
|
||||||
ids[3] = this.E(0)
|
ids[3] = this.E(0)
|
||||||
if ids[3] >= 1:
|
if ids[3] >= 1:
|
||||||
|
@ -120,6 +121,8 @@ cdef cppclass StateC:
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
if ids[i] >= 0:
|
if ids[i] >= 0:
|
||||||
ids[i] += this.offset
|
ids[i] += this.offset
|
||||||
|
else:
|
||||||
|
ids[i] = -1
|
||||||
|
|
||||||
int S(int i) nogil const:
|
int S(int i) nogil const:
|
||||||
if i >= this._s_i:
|
if i >= this._s_i:
|
||||||
|
@ -162,9 +165,9 @@ cdef cppclass StateC:
|
||||||
|
|
||||||
int E(int i) nogil const:
|
int E(int i) nogil const:
|
||||||
if this._e_i <= 0 or this._e_i >= this.length:
|
if this._e_i <= 0 or this._e_i >= this.length:
|
||||||
return 0
|
return -1
|
||||||
if i < 0 or i >= this._e_i:
|
if i < 0 or i >= this._e_i:
|
||||||
return 0
|
return -1
|
||||||
return this._ents[this._e_i - (i+1)].start
|
return this._ents[this._e_i - (i+1)].start
|
||||||
|
|
||||||
int L(int i, int idx) nogil const:
|
int L(int i, int idx) nogil const:
|
||||||
|
|
|
@ -161,8 +161,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
cdef attr_t label
|
cdef attr_t label
|
||||||
if name == '-' or name == None:
|
if name == '-' or name == None:
|
||||||
move_str = 'M'
|
return Transition(clas=0, move=MISSING, label=0, score=0)
|
||||||
label = 0
|
|
||||||
elif name == '!O':
|
elif name == '!O':
|
||||||
return Transition(clas=0, move=ISNT, label=0, score=0)
|
return Transition(clas=0, move=ISNT, label=0, score=0)
|
||||||
elif '-' in name:
|
elif '-' in name:
|
||||||
|
@ -220,6 +219,31 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
raise Exception(move)
|
raise Exception(move)
|
||||||
return t
|
return t
|
||||||
|
|
||||||
|
#def add_action(self, int action, label_name):
|
||||||
|
# cdef attr_t label_id
|
||||||
|
# if not isinstance(label_name, (int, long)):
|
||||||
|
# label_id = self.strings.add(label_name)
|
||||||
|
# else:
|
||||||
|
# label_id = label_name
|
||||||
|
# if action == OUT and label_id != 0:
|
||||||
|
# return
|
||||||
|
# if action == MISSING or action == ISNT:
|
||||||
|
# return
|
||||||
|
# # Check we're not creating a move we already have, so that this is
|
||||||
|
# # idempotent
|
||||||
|
# for trans in self.c[:self.n_moves]:
|
||||||
|
# if trans.move == action and trans.label == label_id:
|
||||||
|
# return 0
|
||||||
|
# if self.n_moves >= self._size:
|
||||||
|
# self._size *= 2
|
||||||
|
# self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
|
||||||
|
# self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
|
||||||
|
# assert self.c[self.n_moves].label == label_id
|
||||||
|
# self.n_moves += 1
|
||||||
|
# return 1
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef int initialize_state(self, StateC* st) nogil:
|
cdef int initialize_state(self, StateC* st) nogil:
|
||||||
# This is especially necessary when we use limited training data.
|
# This is especially necessary when we use limited training data.
|
||||||
for i in range(st.length):
|
for i in range(st.length):
|
||||||
|
|
|
@ -13,6 +13,7 @@ cdef class Parser:
|
||||||
cdef public object model
|
cdef public object model
|
||||||
cdef readonly TransitionSystem moves
|
cdef readonly TransitionSystem moves
|
||||||
cdef readonly object cfg
|
cdef readonly object cfg
|
||||||
|
cdef public object _multitasks
|
||||||
|
|
||||||
cdef void _parse_step(self, StateC* state,
|
cdef void _parse_step(self, StateC* state,
|
||||||
const float* feat_weights,
|
const float* feat_weights,
|
||||||
|
|
|
@ -7,6 +7,7 @@ from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
from collections import Counter, OrderedDict
|
from collections import Counter, OrderedDict
|
||||||
import ujson
|
import ujson
|
||||||
|
import json
|
||||||
import contextlib
|
import contextlib
|
||||||
|
|
||||||
from libc.math cimport exp
|
from libc.math cimport exp
|
||||||
|
@ -37,10 +38,9 @@ from preshed.maps cimport MapStruct
|
||||||
from preshed.maps cimport map_get
|
from preshed.maps cimport map_get
|
||||||
|
|
||||||
from thinc.api import layerize, chain, noop, clone, with_flatten
|
from thinc.api import layerize, chain, noop, clone, with_flatten
|
||||||
from thinc.neural import Model, Affine, ReLu, Maxout
|
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
|
||||||
from thinc.neural._classes.batchnorm import BatchNorm as BN
|
from thinc.misc import LayerNorm
|
||||||
from thinc.neural._classes.selu import SELU
|
|
||||||
from thinc.neural._classes.layernorm import LayerNorm
|
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.ops import NumpyOps, CupyOps
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module
|
||||||
|
|
||||||
|
@ -48,7 +48,8 @@ from .. import util
|
||||||
from ..util import get_async, get_cuda_stream
|
from ..util import get_async, get_cuda_stream
|
||||||
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
||||||
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
|
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
|
||||||
from .._ml import Residual, drop_layer
|
from .._ml import Residual, drop_layer, flatten
|
||||||
|
from .._ml import link_vectors_to_models
|
||||||
from ..compat import json_dumps
|
from ..compat import json_dumps
|
||||||
|
|
||||||
from . import _parse_features
|
from . import _parse_features
|
||||||
|
@ -238,14 +239,15 @@ cdef class Parser:
|
||||||
Base class of the DependencyParser and EntityRecognizer.
|
Base class of the DependencyParser and EntityRecognizer.
|
||||||
"""
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg):
|
def Model(cls, nr_class, token_vector_width=128, hidden_width=200, depth=1, **cfg):
|
||||||
depth = util.env_opt('parser_hidden_depth', depth)
|
depth = util.env_opt('parser_hidden_depth', depth)
|
||||||
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
||||||
hidden_width = util.env_opt('hidden_width', hidden_width)
|
hidden_width = util.env_opt('hidden_width', hidden_width)
|
||||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
||||||
embed_size = util.env_opt('embed_size', 4000)
|
embed_size = util.env_opt('embed_size', 7000)
|
||||||
tensors = fine_tune(Tok2Vec(token_vector_width, embed_size,
|
tok2vec = Tok2Vec(token_vector_width, embed_size,
|
||||||
preprocess=doc2feats()))
|
pretrained_dims=cfg.get('pretrained_dims', 0))
|
||||||
|
tok2vec = chain(tok2vec, flatten)
|
||||||
if parser_maxout_pieces == 1:
|
if parser_maxout_pieces == 1:
|
||||||
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
||||||
nF=cls.nr_feature,
|
nF=cls.nr_feature,
|
||||||
|
@ -262,8 +264,8 @@ cdef class Parser:
|
||||||
upper.is_noop = True
|
upper.is_noop = True
|
||||||
else:
|
else:
|
||||||
upper = chain(
|
upper = chain(
|
||||||
clone(Maxout(hidden_width), (depth-1)),
|
clone(Maxout(hidden_width), depth-1),
|
||||||
zero_init(Affine(nr_class, drop_factor=0.0))
|
zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
|
||||||
)
|
)
|
||||||
upper.is_noop = False
|
upper.is_noop = False
|
||||||
# TODO: This is an unfortunate hack atm!
|
# TODO: This is an unfortunate hack atm!
|
||||||
|
@ -277,7 +279,7 @@ cdef class Parser:
|
||||||
'hidden_width': hidden_width,
|
'hidden_width': hidden_width,
|
||||||
'maxout_pieces': parser_maxout_pieces
|
'maxout_pieces': parser_maxout_pieces
|
||||||
}
|
}
|
||||||
return (tensors, lower, upper), cfg
|
return (tok2vec, lower, upper), cfg
|
||||||
|
|
||||||
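For reference, the parser tunables routed through util.env_opt in this hunk, with the defaults after this change:

    # parser_hidden_depth   = 1
    # token_vector_width    = 128
    # hidden_width          = 200    (was 300)
    # parser_maxout_pieces  = 2
    # embed_size            = 7000   (was 4000)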
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
||||||
"""
|
"""
|
||||||
|
@ -307,12 +309,16 @@ cdef class Parser:
|
||||||
cfg['beam_width'] = util.env_opt('beam_width', 1)
|
cfg['beam_width'] = util.env_opt('beam_width', 1)
|
||||||
if 'beam_density' not in cfg:
|
if 'beam_density' not in cfg:
|
||||||
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
|
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
|
||||||
|
if 'pretrained_dims' not in cfg:
|
||||||
|
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
||||||
|
cfg.setdefault('cnn_maxout_pieces', 3)
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
if 'actions' in self.cfg:
|
if 'actions' in self.cfg:
|
||||||
for action, labels in self.cfg.get('actions', {}).items():
|
for action, labels in self.cfg.get('actions', {}).items():
|
||||||
for label in labels:
|
for label in labels:
|
||||||
self.moves.add_action(action, label)
|
self.moves.add_action(action, label)
|
||||||
self.model = model
|
self.model = model
|
||||||
|
self._multitasks = []
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
@ -332,11 +338,11 @@ cdef class Parser:
|
||||||
beam_density = self.cfg.get('beam_density', 0.0)
|
beam_density = self.cfg.get('beam_density', 0.0)
|
||||||
cdef Beam beam
|
cdef Beam beam
|
||||||
if beam_width == 1:
|
if beam_width == 1:
|
||||||
states = self.parse_batch([doc], [doc.tensor])
|
states = self.parse_batch([doc])
|
||||||
self.set_annotations([doc], states)
|
self.set_annotations([doc], states)
|
||||||
return doc
|
return doc
|
||||||
else:
|
else:
|
||||||
beam = self.beam_parse([doc], [doc.tensor],
|
beam = self.beam_parse([doc],
|
||||||
beam_width=beam_width, beam_density=beam_density)[0]
|
beam_width=beam_width, beam_density=beam_density)[0]
|
||||||
output = self.moves.get_beam_annot(beam)
|
output = self.moves.get_beam_annot(beam)
|
||||||
state = <StateClass>beam.at(0)
|
state = <StateClass>beam.at(0)
|
||||||
|
@ -365,11 +371,11 @@ cdef class Parser:
|
||||||
cdef Beam beam
|
cdef Beam beam
|
||||||
for docs in cytoolz.partition_all(batch_size, docs):
|
for docs in cytoolz.partition_all(batch_size, docs):
|
||||||
docs = list(docs)
|
docs = list(docs)
|
||||||
tokvecs = [doc.tensor for doc in docs]
|
|
||||||
if beam_width == 1:
|
if beam_width == 1:
|
||||||
parse_states = self.parse_batch(docs, tokvecs)
|
parse_states = self.parse_batch(docs)
|
||||||
|
beams = []
|
||||||
else:
|
else:
|
||||||
beams = self.beam_parse(docs, tokvecs,
|
beams = self.beam_parse(docs,
|
||||||
beam_width=beam_width, beam_density=beam_density)
|
beam_width=beam_width, beam_density=beam_density)
|
||||||
parse_states = []
|
parse_states = []
|
||||||
for beam in beams:
|
for beam in beams:
|
||||||
|
@ -377,7 +383,7 @@ cdef class Parser:
|
||||||
self.set_annotations(docs, parse_states)
|
self.set_annotations(docs, parse_states)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def parse_batch(self, docs, tokvecses):
|
def parse_batch(self, docs):
|
||||||
cdef:
|
cdef:
|
||||||
precompute_hiddens state2vec
|
precompute_hiddens state2vec
|
||||||
StateClass state
|
StateClass state
|
||||||
|
@ -388,21 +394,15 @@ cdef class Parser:
|
||||||
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
|
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
if isinstance(tokvecses, np.ndarray):
|
|
||||||
tokvecses = [tokvecses]
|
|
||||||
|
|
||||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
cuda_stream = get_cuda_stream()
|
||||||
if USE_FINE_TUNE:
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||||
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
0.0)
|
||||||
|
|
||||||
nr_state = len(docs)
|
nr_state = len(docs)
|
||||||
nr_class = self.moves.n_moves
|
nr_class = self.moves.n_moves
|
||||||
nr_dim = tokvecs.shape[1]
|
nr_dim = tokvecs.shape[1]
|
||||||
nr_feat = self.nr_feature
|
nr_feat = self.nr_feature
|
||||||
|
|
||||||
cuda_stream = get_cuda_stream()
|
|
||||||
state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs,
|
|
||||||
cuda_stream, 0.0)
|
|
||||||
nr_piece = state2vec.nP
|
nr_piece = state2vec.nP
|
||||||
|
|
||||||
states = self.moves.init_batch(docs)
|
states = self.moves.init_batch(docs)
|
||||||
|
@ -418,21 +418,23 @@ cdef class Parser:
|
||||||
c_token_ids = <int*>token_ids.data
|
c_token_ids = <int*>token_ids.data
|
||||||
c_is_valid = <int*>is_valid.data
|
c_is_valid = <int*>is_valid.data
|
||||||
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
|
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
|
||||||
|
cdef int nr_step
|
||||||
while not next_step.empty():
|
while not next_step.empty():
|
||||||
|
nr_step = next_step.size()
|
||||||
if not has_hidden:
|
if not has_hidden:
|
||||||
for i in cython.parallel.prange(
|
for i in cython.parallel.prange(nr_step, num_threads=6,
|
||||||
next_step.size(), num_threads=6, nogil=True):
|
nogil=True):
|
||||||
self._parse_step(next_step[i],
|
self._parse_step(next_step[i],
|
||||||
feat_weights, nr_class, nr_feat, nr_piece)
|
feat_weights, nr_class, nr_feat, nr_piece)
|
||||||
else:
|
else:
|
||||||
for i in range(next_step.size()):
|
for i in range(nr_step):
|
||||||
st = next_step[i]
|
st = next_step[i]
|
||||||
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
|
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
|
||||||
self.moves.set_valid(&c_is_valid[i*nr_class], st)
|
self.moves.set_valid(&c_is_valid[i*nr_class], st)
|
||||||
vectors = state2vec(token_ids[:next_step.size()])
|
vectors = state2vec(token_ids[:next_step.size()])
|
||||||
scores = vec2scores(vectors)
|
scores = vec2scores(vectors)
|
||||||
c_scores = <float*>scores.data
|
c_scores = <float*>scores.data
|
||||||
for i in range(next_step.size()):
|
for i in range(nr_step):
|
||||||
st = next_step[i]
|
st = next_step[i]
|
||||||
guess = arg_max_if_valid(
|
guess = arg_max_if_valid(
|
||||||
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
|
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
|
||||||
|
@ -445,18 +447,15 @@ cdef class Parser:
|
||||||
next_step.push_back(st)
|
next_step.push_back(st)
|
||||||
return states
|
return states
|
||||||
|
|
||||||
def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001):
|
def beam_parse(self, docs, int beam_width=3, float beam_density=0.001):
|
||||||
cdef Beam beam
|
cdef Beam beam
|
||||||
cdef np.ndarray scores
|
cdef np.ndarray scores
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef int nr_class = self.moves.n_moves
|
cdef int nr_class = self.moves.n_moves
|
||||||
cdef StateClass stcls, output
|
cdef StateClass stcls, output
|
||||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
|
||||||
if USE_FINE_TUNE:
|
|
||||||
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = get_cuda_stream()
|
||||||
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||||
cuda_stream, 0.0)
|
0.0)
|
||||||
beams = []
|
beams = []
|
||||||
cdef int offset = 0
|
cdef int offset = 0
|
||||||
cdef int j = 0
|
cdef int j = 0
|
||||||
|
@ -516,29 +515,24 @@ cdef class Parser:
|
||||||
free(scores)
|
free(scores)
|
||||||
free(token_ids)
|
free(token_ids)
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||||
if not any(self.moves.has_gold(gold) for gold in golds):
|
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||||
return None
|
return None
|
||||||
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
|
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
|
||||||
return self.update_beam(docs_tokvecs, golds,
|
return self.update_beam(docs, golds,
|
||||||
self.cfg['beam_width'], self.cfg['beam_density'],
|
self.cfg['beam_width'], self.cfg['beam_density'],
|
||||||
drop=drop, sgd=sgd, losses=losses)
|
drop=drop, sgd=sgd, losses=losses)
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
docs, tokvec_lists = docs_tokvecs
|
|
||||||
tokvecs = self.model[0].ops.flatten(tokvec_lists)
|
|
||||||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
golds = [golds]
|
golds = [golds]
|
||||||
if USE_FINE_TUNE:
|
|
||||||
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
|
||||||
tokvecs = self.model[0].ops.flatten(my_tokvecs)
|
|
||||||
|
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = get_cuda_stream()
|
||||||
|
|
||||||
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
||||||
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||||
0.0)
|
drop)
|
||||||
todo = [(s, g) for (s, g) in zip(states, golds)
|
todo = [(s, g) for (s, g) in zip(states, golds)
|
||||||
if not s.is_final() and g is not None]
|
if not s.is_final() and g is not None]
|
||||||
if not todo:
|
if not todo:
|
||||||
|
@ -582,13 +576,9 @@ cdef class Parser:
|
||||||
if n_steps >= max_steps:
|
if n_steps >= max_steps:
|
||||||
break
|
break
|
||||||
self._make_updates(d_tokvecs,
|
self._make_updates(d_tokvecs,
|
||||||
backprops, sgd, cuda_stream)
|
bp_tokvecs, backprops, sgd, cuda_stream)
|
||||||
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
|
||||||
if USE_FINE_TUNE:
|
|
||||||
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
|
||||||
return d_tokvecs
|
|
||||||
|
|
||||||
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
|
def update_beam(self, docs, golds, width=None, density=None,
|
||||||
drop=0., sgd=None, losses=None):
|
drop=0., sgd=None, losses=None):
|
||||||
if not any(self.moves.has_gold(gold) for gold in golds):
|
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||||
return None
|
return None
|
||||||
|
@ -600,26 +590,20 @@ cdef class Parser:
|
||||||
density = self.cfg.get('beam_density', 0.0)
|
density = self.cfg.get('beam_density', 0.0)
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
docs, tokvecs = docs_tokvecs
|
|
||||||
lengths = [len(d) for d in docs]
|
lengths = [len(d) for d in docs]
|
||||||
assert min(lengths) >= 1
|
assert min(lengths) >= 1
|
||||||
tokvecs = self.model[0].ops.flatten(tokvecs)
|
|
||||||
if USE_FINE_TUNE:
|
|
||||||
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
|
||||||
tokvecs += self.model[0].ops.flatten(my_tokvecs)
|
|
||||||
|
|
||||||
states = self.moves.init_batch(docs)
|
states = self.moves.init_batch(docs)
|
||||||
for gold in golds:
|
for gold in golds:
|
||||||
self.moves.preprocess_gold(gold)
|
self.moves.preprocess_gold(gold)
|
||||||
|
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = get_cuda_stream()
|
||||||
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
|
||||||
|
|
||||||
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
|
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
|
||||||
states, tokvecs, golds,
|
states, golds,
|
||||||
state2vec, vec2scores,
|
state2vec, vec2scores,
|
||||||
width, density,
|
width, density,
|
||||||
sgd=sgd, drop=drop, losses=losses)
|
drop=drop, losses=losses)
|
||||||
backprop_lower = []
|
backprop_lower = []
|
||||||
cdef float batch_size = len(docs)
|
cdef float batch_size = len(docs)
|
||||||
for i, d_scores in enumerate(states_d_scores):
|
for i, d_scores in enumerate(states_d_scores):
|
||||||
|
@ -637,11 +621,7 @@ cdef class Parser:
|
||||||
else:
|
else:
|
||||||
backprop_lower.append((ids, d_vector, bp_vectors))
|
backprop_lower.append((ids, d_vector, bp_vectors))
|
||||||
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
||||||
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
|
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)
|
||||||
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
|
|
||||||
if USE_FINE_TUNE:
|
|
||||||
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
|
||||||
return d_tokvecs
|
|
||||||
|
|
||||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||||
"""Make a square batch, of length equal to the shortest doc. A long
|
"""Make a square batch, of length equal to the shortest doc. A long
|
||||||
|
@ -679,7 +659,7 @@ cdef class Parser:
|
||||||
max_moves = max(max_moves, len(oracle_actions))
|
max_moves = max(max_moves, len(oracle_actions))
|
||||||
return states, golds, max_moves
|
return states, golds, max_moves
|
||||||
|
|
||||||
def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
|
def _make_updates(self, d_tokvecs, bp_tokvecs, backprops, sgd, cuda_stream=None):
|
||||||
# Tells CUDA to block, so our async copies complete.
|
# Tells CUDA to block, so our async copies complete.
|
||||||
if cuda_stream is not None:
|
if cuda_stream is not None:
|
||||||
cuda_stream.synchronize()
|
cuda_stream.synchronize()
|
||||||
|
@ -690,6 +670,7 @@ cdef class Parser:
|
||||||
d_state_features *= mask.reshape(ids.shape + (1,))
|
d_state_features *= mask.reshape(ids.shape + (1,))
|
||||||
self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
|
self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
|
||||||
d_state_features)
|
d_state_features)
|
||||||
|
bp_tokvecs(d_tokvecs, sgd=sgd)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def move_names(self):
|
def move_names(self):
|
||||||
|
@ -699,11 +680,12 @@ cdef class Parser:
|
||||||
names.append(name)
|
names.append(name)
|
||||||
return names
|
return names
|
||||||
|
|
||||||
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
|
def get_batch_model(self, docs, stream, dropout):
|
||||||
_, lower, upper = self.model
|
tok2vec, lower, upper = self.model
|
||||||
state2vec = precompute_hiddens(batch_size, tokvecs,
|
tokvecs, bp_tokvecs = tok2vec.begin_update(docs, drop=dropout)
|
||||||
lower, stream, drop=dropout)
|
state2vec = precompute_hiddens(len(docs), tokvecs,
|
||||||
return state2vec, upper
|
lower, stream, drop=0.0)
|
||||||
|
return (tokvecs, bp_tokvecs), state2vec, upper
|
||||||
|
|
||||||
nr_feature = 8
|
nr_feature = 8
|
||||||
|
|
||||||
|
@ -766,7 +748,7 @@ cdef class Parser:
|
||||||
# order, or the model goes out of synch
|
# order, or the model goes out of synch
|
||||||
self.cfg.setdefault('extra_labels', []).append(label)
|
self.cfg.setdefault('extra_labels', []).append(label)
|
||||||
|
|
||||||
def begin_training(self, gold_tuples, **cfg):
|
def begin_training(self, gold_tuples, pipeline=None, **cfg):
|
||||||
if 'model' in cfg:
|
if 'model' in cfg:
|
||||||
self.model = cfg['model']
|
self.model = cfg['model']
|
||||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
||||||
|
@ -775,9 +757,22 @@ cdef class Parser:
|
||||||
for label in labels:
|
for label in labels:
|
||||||
self.moves.add_action(action, label)
|
self.moves.add_action(action, label)
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
|
cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
|
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
|
||||||
|
self.init_multitask_objectives(gold_tuples, pipeline, **cfg)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
self.cfg.update(cfg)
|
self.cfg.update(cfg)
|
||||||
|
|
||||||
|
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
|
||||||
|
'''Setup models for secondary objectives, to benefit from multi-task
|
||||||
|
learning. This method is intended to be overridden by subclasses.
|
||||||
|
|
||||||
|
For instance, the dependency parser can benefit from sharing
|
||||||
|
an input representation with a label prediction model. These auxiliary
|
||||||
|
models are discarded after training.
|
||||||
|
'''
|
||||||
|
pass
|
||||||
|
|
||||||
def preprocess_gold(self, docs_golds):
|
def preprocess_gold(self, docs_golds):
|
||||||
for doc, gold in docs_golds:
|
for doc, gold in docs_golds:
|
||||||
yield doc, gold
|
yield doc, gold
|
||||||
|
@ -813,6 +808,7 @@ cdef class Parser:
|
||||||
if 'model' not in exclude:
|
if 'model' not in exclude:
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
|
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
self.model, cfg = self.Model(**self.cfg)
|
self.model, cfg = self.Model(**self.cfg)
|
||||||
else:
|
else:
|
||||||
cfg = {}
|
cfg = {}
|
||||||
|
@ -835,7 +831,7 @@ cdef class Parser:
|
||||||
('upper_model', lambda: self.model[2].to_bytes()),
|
('upper_model', lambda: self.model[2].to_bytes()),
|
||||||
('vocab', lambda: self.vocab.to_bytes()),
|
('vocab', lambda: self.vocab.to_bytes()),
|
||||||
('moves', lambda: self.moves.to_bytes(strings=False)),
|
('moves', lambda: self.moves.to_bytes(strings=False)),
|
||||||
('cfg', lambda: ujson.dumps(self.cfg))
|
('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True))
|
||||||
))
|
))
|
||||||
if 'model' in exclude:
|
if 'model' in exclude:
|
||||||
exclude['tok2vec_model'] = True
|
exclude['tok2vec_model'] = True
|
||||||
|
@ -848,7 +844,7 @@ cdef class Parser:
|
||||||
deserializers = OrderedDict((
|
deserializers = OrderedDict((
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||||
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
||||||
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
('cfg', lambda b: self.cfg.update(json.loads(b))),
|
||||||
('tok2vec_model', lambda b: None),
|
('tok2vec_model', lambda b: None),
|
||||||
('lower_model', lambda b: None),
|
('lower_model', lambda b: None),
|
||||||
('upper_model', lambda b: None)
|
('upper_model', lambda b: None)
|
||||||
|
@ -856,9 +852,11 @@ cdef class Parser:
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
if 'model' not in exclude:
|
if 'model' not in exclude:
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model, cfg = self.Model(self.moves.n_moves)
|
self.model, cfg = self.Model(**self.cfg)
|
||||||
|
cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
else:
|
else:
|
||||||
cfg = {}
|
cfg = {}
|
||||||
|
cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
if 'tok2vec_model' in msg:
|
if 'tok2vec_model' in msg:
|
||||||
self.model[0].from_bytes(msg['tok2vec_model'])
|
self.model[0].from_bytes(msg['tok2vec_model'])
|
||||||
if 'lower_model' in msg:
|
if 'lower_model' in msg:
|
||||||
|
|
|
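The hunks above drop the separate token-vector arguments: update(), update_beam(), parse_batch() and beam_parse() now take docs (and golds) only, and get_batch_model() runs the tok2vec component itself via begin_update(). A minimal sketch of the new call pattern, mirroring the updated tests later in this diff (`parser`, `doc` and `gold` are assumed to already exist):

    def optimize(weights, gradient, key=None):
        # toy SGD callback, as used in the tests; a real optimizer would come from thinc
        weights -= 0.001 * gradient

    # no (docs, tokvecs) tuple any more -- just docs and golds
    parser.update([doc], [gold], sgd=optimize)

    # prediction is unchanged on the surface, greedy or beam search
    parser(doc)
    parser(doc, beam_width=32, beam_density=0.001)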
@@ -148,7 +148,7 @@ cdef class TransitionSystem:

     def add_action(self, int action, label_name):
         cdef attr_t label_id
-        if not isinstance(label_name, int):
+        if not isinstance(label_name, (int, long)):
             label_id = self.strings.add(label_name)
         else:
             label_id = label_name
@@ -12,7 +12,7 @@ from .. import util


 _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
-              'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
+              'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'th', 'xx']
 _models = {'en': ['en_core_web_sm'],
            'de': ['de_core_news_md'],
            'fr': ['fr_depvec_web_lg'],
@@ -108,6 +108,11 @@ def he_tokenizer():
 def nb_tokenizer():
     return util.get_lang_class('nb').Defaults.create_tokenizer()

+
+@pytest.fixture
+def th_tokenizer():
+    pythainlp = pytest.importorskip("pythainlp")
+    return util.get_lang_class('th').Defaults.create_tokenizer()
+

 @pytest.fixture
 def stringstore():
@@ -67,12 +67,6 @@ def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
     assert len(tokens) == 4


-@pytest.mark.parametrize('text', ["blau-rot"])
-def test_tokenizer_splits_hyphens(de_tokenizer, text):
-    tokens = de_tokenizer(text)
-    assert len(tokens) == 3
-
-
 @pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
 def test_tokenizer_splits_numeric_range(de_tokenizer, text):
     tokens = de_tokenizer(text)
@@ -100,17 +94,21 @@ def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
     assert len(tokens) == 3


+@pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt'])
+def test_tokenizer_keeps_hyphens(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 1
+
+
 def test_tokenizer_splits_double_hyphen_infix(de_tokenizer):
     tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
-    assert len(tokens) == 12
+    assert len(tokens) == 10
     assert tokens[0].text == "Viele"
     assert tokens[1].text == "Regeln"
     assert tokens[2].text == "--"
     assert tokens[3].text == "wie"
     assert tokens[4].text == "die"
-    assert tokens[5].text == "Bindestrich"
-    assert tokens[6].text == "-"
-    assert tokens[7].text == "Regeln"
-    assert tokens[8].text == "--"
-    assert tokens[9].text == "sind"
-    assert tokens[10].text == "kompliziert"
+    assert tokens[5].text == "Bindestrich-Regeln"
+    assert tokens[6].text == "--"
+    assert tokens[7].text == "sind"
+    assert tokens[8].text == "kompliziert"
@@ -25,15 +25,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
     assert len(tokens) == 109


-@pytest.mark.parametrize('text,length', [
-    ("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1),
-    ("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1),
-    ("Kraftfahrzeug-Haftpflichtversicherung", 3),
-    ("Vakuum-Mittelfrequenz-Induktionsofen", 5)
+@pytest.mark.parametrize('text', [
+    "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
+    "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
+    "Kraftfahrzeug-Haftpflichtversicherung",
+    "Vakuum-Mittelfrequenz-Induktionsofen"
     ])
-def test_tokenizer_handles_long_words(de_tokenizer, text, length):
+def test_tokenizer_handles_long_words(de_tokenizer, text):
     tokens = de_tokenizer(text)
-    assert len(tokens) == length
+    assert len(tokens) == 1


 @pytest.mark.parametrize('text,length', [
new file spacy/tests/lang/th/__init__.py (0 lines)

new file spacy/tests/lang/th/test_tokenizer.py (13 lines)
@@ -0,0 +1,13 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+TOKENIZER_TESTS = [
+    ("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม'])
+]
+
+@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
+def test_thai_tokenizer(th_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in th_tokenizer(text)]
+    assert tokens == expected_tokens
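The new Thai fixture goes through util.get_lang_class, so the tokenizer can also be created directly outside pytest. A minimal sketch, assuming the optional pythainlp dependency is installed:

    from spacy import util

    th_cls = util.get_lang_class('th')              # needs pythainlp at runtime
    tokenizer = th_cls.Defaults.create_tokenizer()
    tokens = [t.text for t in tokenizer("คุณรักผมไหม")]
    # expected, per the test above: ['คุณ', 'รัก', 'ผม', 'ไหม']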
@@ -26,7 +26,7 @@ def arc_eager(vocab):

 @pytest.fixture
 def tok2vec():
-    return Tok2Vec(8, 100, preprocess=doc2feats())
+    return Tok2Vec(8, 100)


 @pytest.fixture
@@ -61,33 +61,22 @@ def test_predict_doc(parser, tok2vec, model, doc):
     parser(doc)


-def test_update_doc(parser, tok2vec, model, doc, gold):
+def test_update_doc(parser, model, doc, gold):
     parser.model = model
-    tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
-    d_tokvecs = parser.update(([doc], tokvecs), [gold])
-    assert d_tokvecs[0].shape == tokvecs[0].shape
     def optimize(weights, gradient, key=None):
         weights -= 0.001 * gradient
-    bp_tokvecs(d_tokvecs, sgd=optimize)
-    assert d_tokvecs[0].sum() == 0.
+    parser.update([doc], [gold], sgd=optimize)


-def test_predict_doc_beam(parser, tok2vec, model, doc):
-    doc.tensor = tok2vec([doc])[0]
+def test_predict_doc_beam(parser, model, doc):
     parser.model = model
     parser(doc, beam_width=32, beam_density=0.001)
-    for word in doc:
-        print(word.text, word.head, word.dep_)


-def test_update_doc_beam(parser, tok2vec, model, doc, gold):
+def test_update_doc_beam(parser, model, doc, gold):
     parser.model = model
-    tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
-    d_tokvecs = parser.update_beam(([doc], tokvecs), [gold])
-    assert d_tokvecs[0].shape == tokvecs[0].shape
     def optimize(weights, gradient, key=None):
         weights -= 0.001 * gradient
-    bp_tokvecs(d_tokvecs, sgd=optimize)
-    assert d_tokvecs[0].sum() == 0.
+    parser.update_beam([doc], [gold], sgd=optimize)
new file spacy/tests/regression/test_issue1305.py (8 lines)
@@ -0,0 +1,8 @@
+import pytest
+
+@pytest.mark.models('en')
+def test_issue1305(EN):
+    '''Test lemmatization of English VBZ'''
+    assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work'])
+    doc = EN(u'This app works well')
+    assert doc[2].lemma_ == 'work'

new file spacy/tests/regression/test_issue1380.py (14 lines)
@@ -0,0 +1,14 @@
+from __future__ import unicode_literals
+import pytest
+
+from ...language import Language
+
+def test_issue1380_empty_string():
+    nlp = Language()
+    doc = nlp('')
+    assert len(doc) == 0
+
+@pytest.mark.models('en')
+def test_issue1380_en(EN):
+    doc = EN('')
+    assert len(doc) == 0
@@ -9,11 +9,14 @@ import pytest
 @pytest.mark.models('en')
 def test_issue429(EN):
     def merge_phrases(matcher, doc, i, matches):
         if i != len(matches) - 1:
             return None
         spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
         for ent_id, label, span in spans:
-            span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label])
+            span.merge(
+                tag=('NNP' if label else span.root.tag_),
+                lemma=span.text,
+                label='PERSON')

     doc = EN('a')
     matcher = Matcher(EN.vocab)
@@ -11,7 +11,7 @@ import pytest
 def taggers(en_vocab):
     tagger1 = Tagger(en_vocab)
     tagger2 = Tagger(en_vocab)
-    tagger1.model = tagger1.Model(8, 8)
+    tagger1.model = tagger1.Model(8)
     tagger2.model = tagger1.model
     return (tagger1, tagger2)

@ -6,6 +6,16 @@ from ...strings import StringStore
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def test_string_hash(stringstore):
|
||||||
|
'''Test that string hashing is stable across platforms'''
|
||||||
|
ss = stringstore
|
||||||
|
assert ss.add('apple') == 8566208034543834098
|
||||||
|
heart = '\U0001f499'
|
||||||
|
print(heart)
|
||||||
|
h = ss.add(heart)
|
||||||
|
assert h == 11841826740069053588
|
||||||
|
|
||||||
|
|
||||||
def test_stringstore_from_api_docs(stringstore):
|
def test_stringstore_from_api_docs(stringstore):
|
||||||
apple_hash = stringstore.add('apple')
|
apple_hash = stringstore.add('apple')
|
||||||
assert apple_hash == 8566208034543834098
|
assert apple_hash == 8566208034543834098
|
||||||
|
|
|
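The added test pins exact hash values, so the same string maps to the same 64-bit key on every platform. A small sketch of the behaviour being tested, assuming a bare StringStore:

    from spacy.strings import StringStore

    ss = StringStore()
    key = ss.add('apple')
    assert key == 8566208034543834098   # stable across platforms and sessions
    assert ss[key] == 'apple'           # the key resolves back to the original string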
@@ -34,7 +34,6 @@ def test_matcher_from_api_docs(en_vocab):
     assert len(patterns[0])


-@pytest.mark.xfail
 def test_matcher_from_usage_docs(en_vocab):
     text = "Wow 😀 This is really cool! 😂 😂"
     doc = get_doc(en_vocab, words=text.split(' '))
@@ -46,7 +45,8 @@ def test_matcher_from_usage_docs(en_vocab):
         if doc.vocab.strings[match_id] == 'HAPPY':
             doc.sentiment += 0.1
         span = doc[start : end]
-        token = span.merge(norm='happy emoji')
+        token = span.merge()
+        token.vocab[token.text].norm_ = 'happy emoji'

     matcher = Matcher(en_vocab)
     matcher.add('HAPPY', label_sentiment, *pos_patterns)
@@ -98,11 +98,11 @@ def test_matcher_match_multi(matcher):
             (doc.vocab.strings['Java'], 5, 6)]


-@pytest.mark.xfail
 def test_matcher_phrase_matcher(en_vocab):
     words = ["Google", "Now"]
     doc = get_doc(en_vocab, words)
-    matcher = PhraseMatcher(en_vocab, [doc])
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add('COMPANY', None, doc)
     words = ["I", "like", "Google", "Now", "best"]
     doc = get_doc(en_vocab, words)
     assert len(matcher(doc)) == 1
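The un-xfailed test reflects the new PhraseMatcher API: the matcher is created empty and patterns are registered with add(), mirroring Matcher. A minimal sketch, assuming `vocab`, `pattern_doc` and `doc` are built as in the test above:

    from spacy.matcher import PhraseMatcher

    matcher = PhraseMatcher(vocab)             # no pattern docs in the constructor
    matcher.add('COMPANY', None, pattern_doc)  # on_match callback first, then pattern Doc(s)
    matches = matcher(doc)                     # list of (match_id, start, end) tuples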
@@ -9,7 +9,8 @@ from .util import get_doc

 from pathlib import Path
 import pytest
-from thinc.neural import Maxout, Softmax
+from thinc.neural._classes.maxout import Maxout
+from thinc.neural._classes.softmax import Softmax
 from thinc.api import chain

@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import sys
 import pytest


@@ -37,9 +38,10 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
     tokens = tokenizer(text)
     assert len(tokens) == length


 @pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
                                          ('i💙you', 3), ('🤘🤘yay!', 4)])
 def test_tokenizer_handles_emoji(tokenizer, text, length):
-    tokens = tokenizer(text)
-    assert len(tokens) == length
+    # These break on narrow unicode builds, e.g. Windows
+    if sys.maxunicode >= 1114111:
+        tokens = tokenizer(text)
+        assert len(tokens) == length
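The added guard distinguishes wide from narrow Unicode builds. A quick way to check which build is running, following the same test:

    import sys

    if sys.maxunicode >= 1114111:
        print("wide build: emoji are single code points")
    else:
        print("narrow build (e.g. some Windows Pythons): emoji become surrogate pairs")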
@@ -54,7 +54,7 @@ cdef class Doc:

     cdef public object noun_chunks_iterator

-    cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1
+    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1

     cpdef np.ndarray to_array(self, object features)

@@ -660,7 +660,7 @@ cdef class Doc:
         """
         with path.open('rb') as file_:
             bytes_data = file_.read()
-        self.from_bytes(bytes_data, **exclude)
+        return self.from_bytes(bytes_data, **exclude)

     def to_bytes(self, **exclude):
         """Serialize, i.e. export the document contents to a binary string.
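With from_disk() now returning the result of from_bytes(), loading a serialized Doc can be chained in one expression. A small sketch, assuming `vocab` is an existing Vocab and the path (hypothetical here) points to a Doc saved with to_disk():

    from spacy.tokens import Doc

    doc = Doc(vocab).from_disk('/tmp/parsed_doc.bin')  # hypothetical path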
@@ -3,7 +3,7 @@ from __future__ import unicode_literals, print_function

 import os
 import ujson
-import pip
+import pkg_resources
 import importlib
 import regex as re
 from pathlib import Path
@@ -14,6 +14,7 @@ import numpy
 import io
 import dill
 from collections import OrderedDict
+from thinc.neural._classes.model import Model

 import msgpack
 import msgpack_numpy
@@ -180,9 +181,10 @@ def is_package(name):
     name (unicode): Name of package.
     RETURNS (bool): True if installed package, False if not.
     """
-    packages = pip.get_installed_distributions()
+    name = name.lower()  # compare package name against lowercase name
+    packages = pkg_resources.working_set.by_key.keys()
     for package in packages:
-        if package.project_name.replace('-', '_') == name:
+        if package.lower().replace('-', '_') == name:
             return True
     return False

@@ -193,6 +195,7 @@ def get_package_path(name):
     name (unicode): Package name.
     RETURNS (Path): Path to installed package.
     """
+    name = name.lower()  # use lowercase version to be safe
     # Here we're importing the module just to find it. This is worryingly
     # indirect, but it's otherwise very difficult to find the package.
     pkg = importlib.import_module(name)
@@ -557,3 +560,17 @@ def minify_html(html):
     RETURNS (unicode): "Minified" HTML.
     """
     return html.strip().replace(' ', '').replace('\n', '')
+
+
+def use_gpu(gpu_id):
+    try:
+        import cupy.cuda.device
+    except ImportError:
+        return None
+    from thinc.neural.ops import CupyOps
+    device = cupy.cuda.device.Device(gpu_id)
+    device.use()
+    Model.ops = CupyOps()
+    Model.Ops = CupyOps
+    return device

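Two of these util changes are user-visible: is_package() now checks installed distributions via pkg_resources and compares names case-insensitively, and the new use_gpu() switches thinc's global ops to CuPy when a CUDA device is available. A minimal sketch of how they are called, based on the function bodies above:

    from spacy import util

    if util.is_package('en_core_web_sm'):   # case-insensitive after this change
        print('model package is installed')

    device = util.use_gpu(0)                # returns None if cupy is not importable
    if device is None:
        print('running on CPU')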
@@ -6,6 +6,8 @@ import msgpack
 import msgpack_numpy
 msgpack_numpy.patch()
 cimport numpy as np
+from thinc.neural.util import get_array_module
+from thinc.neural._classes.model import Model

 from .typedefs cimport attr_t
 from .strings cimport StringStore
@@ -14,15 +16,29 @@ from .compat import basestring_


 cdef class Vectors:
-    '''Store, save and load word vectors.'''
+    '''Store, save and load word vectors.
+
+    Vectors data is kept in the vectors.data attribute, which should be an
+    instance of numpy.ndarray (for CPU vectors)
+    or cupy.ndarray (for GPU vectors).
+
+    vectors.key2row is a dictionary mapping word hashes to rows
+    in the vectors.data table. The array `vectors.keys` keeps
+    the keys in order, such that keys[vectors.key2row[key]] == key.
+    '''
     cdef public object data
     cdef readonly StringStore strings
     cdef public object key2row
     cdef public object keys
     cdef public int i

-    def __init__(self, strings, data_or_width):
-        self.strings = StringStore()
+    def __init__(self, strings, data_or_width=0):
+        if isinstance(strings, StringStore):
+            self.strings = strings
+        else:
+            self.strings = StringStore()
+            for string in strings:
+                self.strings.add(string)
         if isinstance(data_or_width, int):
             self.data = data = numpy.zeros((len(strings), data_or_width),
                                            dtype='f')
@@ -31,12 +47,17 @@ cdef class Vectors:
             self.i = 0
         self.data = data
         self.key2row = {}
         self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')

     def __reduce__(self):
         return (Vectors, (self.strings, self.data))

     def __getitem__(self, key):
+        '''Get a vector by key. If key is a string, it is hashed
+        to an integer ID using the vectors.strings table.
+
+        If the integer key is not found in the table, a KeyError is raised.
+        '''
         if isinstance(key, basestring):
             key = self.strings[key]
         i = self.key2row[key]
@@ -46,23 +67,30 @@ cdef class Vectors:
             return self.data[i]

     def __setitem__(self, key, vector):
+        '''Set a vector for the given key. If key is a string, it is hashed
+        to an integer ID using the vectors.strings table.
+        '''
         if isinstance(key, basestring):
             key = self.strings.add(key)
         i = self.key2row[key]
         self.data[i] = vector

     def __iter__(self):
+        '''Yield vectors from the table.'''
         yield from self.data

     def __len__(self):
+        '''Return the number of vectors that have been assigned.'''
         return self.i

     def __contains__(self, key):
+        '''Check whether a key has a vector entry in the table.'''
         if isinstance(key, basestring_):
             key = self.strings[key]
         return key in self.key2row

     def add(self, key, vector=None):
+        '''Add a key to the table, optionally setting a vector value as well.'''
         if isinstance(key, basestring_):
             key = self.strings.add(key)
         if key not in self.key2row:
@@ -80,7 +108,9 @@ cdef class Vectors:
         return i

     def items(self):
-        for i, string in enumerate(self.strings):
+        '''Iterate over (string key, vector) pairs, in order.'''
+        for i, key in enumerate(self.keys):
+            string = self.strings[key]
             yield string, self.data[i]

     @property
@@ -118,9 +148,14 @@ cdef class Vectors:
             self.data

     def to_disk(self, path, **exclude):
+        xp = get_array_module(self.data)
+        if xp is numpy:
+            save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
+        else:
+            save_array = lambda arr, file_: xp.save(file_, arr)
         serializers = OrderedDict((
-            ('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)),
-            ('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)),
+            ('vectors', lambda p: save_array(self.data, p.open('wb'))),
+            ('keys', lambda p: xp.save(p.open('wb'), self.keys))
         ))
         return util.to_disk(path, serializers, exclude)

@@ -133,8 +168,9 @@ cdef class Vectors:
                 self.key2row[key] = i

         def load_vectors(path):
+            xp = Model.ops.xp
             if path.exists():
-                self.data = numpy.load(path)
+                self.data = xp.load(path)

         serializers = OrderedDict((
             ('keys', load_keys),
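A rough sketch of the intended Vectors usage, based only on the docstrings and signatures added above (the parts of add() outside this hunk are assumed to assign the next free row):

    import numpy
    from spacy.vectors import Vectors

    vectors = Vectors(['dog', 'cat', 'apple'], 300)          # one zero row per string
    row = vectors.add('dog', vector=numpy.ones((300,), dtype='f'))
    assert 'dog' in vectors                                   # key hashed via vectors.strings
    assert vectors['dog'].shape == (300,)                     # row taken from vectors.data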
@@ -27,6 +27,7 @@ from .vectors import Vectors
 from . import util
 from . import attrs
 from . import symbols
+from ._ml import link_vectors_to_models


 cdef class Vocab:
@@ -65,7 +66,7 @@ cdef class Vocab:
             self.strings.add(name)
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.vectors = Vectors(self.strings, 300)
+        self.vectors = Vectors(self.strings)

     property lang:
         def __get__(self):
@@ -261,7 +262,7 @@ cdef class Vocab:
         Words can be looked up by string or int ID.

         RETURNS:
-            A word vector. Size and shape determed by the
+            A word vector. Size and shape determined by the
             vocab.vectors instance. Usually, a numpy ndarray
             of shape (300,) and dtype float32.

@@ -323,6 +324,7 @@ cdef class Vocab:
                 self.lexemes_from_bytes(file_.read())
         if self.vectors is not None:
             self.vectors.from_disk(path, exclude='strings.json')
+        link_vectors_to_models(self)
         return self

     def to_bytes(self, **exclude):
@@ -336,7 +338,7 @@ cdef class Vocab:
                 return None
             else:
                 return self.vectors.to_bytes(exclude='strings.json')

         getters = OrderedDict((
             ('strings', lambda: self.strings.to_bytes()),
             ('lexemes', lambda: self.lexemes_to_bytes()),
@@ -436,6 +438,7 @@ def unpickle_vocab(sstore, morphology, data_dir,
     vocab.lex_attr_getters = lex_attr_getters
     vocab.lexemes_from_bytes(lexemes_data)
     vocab.length = length
+    link_vectors_to_models(vocab)
     return vocab

@@ -17,6 +17,7 @@ fi

 if [ "${VIA}" == "compile" ]; then
     pip install -r requirements.txt
+    python setup.py build_ext --inplace
     pip install -e .
 fi

@@ -8,4 +8,5 @@ include _includes/_mixins
             | does not exist!

     h2.c-landing__title.u-heading-3.u-padding-small
-        a(href="javascript:history.go(-1)") Click here to go back.
+        +button(false, true, "secondary-light")(href="javascript:history.go(-1)")
+            | Click here to go back
@@ -3,24 +3,22 @@
         "landing": true,
         "logos": [
             {
-                "quora": [ "https://www.quora.com", 150 ],
-                "chartbeat": [ "https://chartbeat.com", 200 ],
-                "duedil": [ "https://www.duedil.com", 150 ],
-                "stitchfix": [ "https://www.stitchfix.com", 190 ]
+                "airbnb": [ "https://www.airbnb.com", 150, 45],
+                "quora": [ "https://www.quora.com", 120, 34 ],
+                "retriever": [ "https://www.retriever.no", 150, 33 ],
+                "stitchfix": [ "https://www.stitchfix.com", 150, 18 ]
             },
             {
-                "wayblazer": [ "http://wayblazer.com", 200 ],
-                "indico": [ "https://indico.io", 150 ],
-                "chattermill": [ "https://chattermill.io", 175 ],
-                "turi": [ "https://turi.com", 150 ],
-                "kip": [ "http://kipthis.com", 70 ]
-            },
+                "chartbeat": [ "https://chartbeat.com", 180, 25 ],
+                "allenai": [ "https://allenai.org", 220, 37 ]
+            }
+        ],
+        "features": [
             {
-                "socrata": [ "https://www.socrata.com", 150 ],
-                "cytora": [ "http://www.cytora.com", 125 ],
-                "signaln": [ "http://signaln.com", 150 ],
-                "wonderflow": [ "http://www.wonderflow.co", 200 ],
-                "synapsify": [ "http://www.gosynapsify.com", 150 ]
+                "thoughtworks": ["https://www.thoughtworks.com/radar/tools", 150, 28],
+                "wapo": ["https://www.washingtonpost.com/news/wonk/wp/2016/05/18/googles-new-artificial-intelligence-cant-understand-these-sentences-can-you/", 100, 77],
+                "venturebeat": ["https://venturebeat.com/2017/01/27/4-ai-startups-that-analyze-customer-reviews/", 150, 19],
+                "microsoft": ["https://www.microsoft.com/developerblog/2016/09/13/training-a-classifier-for-relation-extraction-from-medical-literature/", 130, 28]
             }
         ]
     },
@@ -34,7 +32,24 @@
         "landing": true
     },

-    "announcement" : {
-        "title": "Important Announcement"
+    "styleguide": {
+        "title": "Styleguide",
+        "sidebar": {
+            "Styleguide": { "": "styleguide" },
+            "Resources": {
+                "Website Source": "https://github.com/explosion/spacy/tree/master/website",
+                "Contributing Guide": "https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md"
+            }
+        },
+        "menu": {
+            "Introduction": "intro",
+            "Logo": "logo",
+            "Colors": "colors",
+            "Typography": "typography",
+            "Elements": "elements",
+            "Components": "components",
+            "Embeds": "embeds",
+            "Markup Reference": "markup"
+        }
     }
 }
@@ -11,12 +11,9 @@
         "COMPANY": "Explosion AI",
         "COMPANY_URL": "https://explosion.ai",
         "DEMOS_URL": "https://demos.explosion.ai",
+        "MODELS_REPO": "explosion/spacy-models",

-        "SPACY_VERSION": "1.8",
-        "LATEST_NEWS": {
-            "url": "https://github.com/explosion/spaCy/releases/tag/v2.0.0-alpha",
-            "title": "Test spaCy v2.0.0 alpha!"
-        },
+        "SPACY_VERSION": "2.0",

         "SOCIAL": {
             "twitter": "spacy_io",
@@ -27,25 +24,23 @@
         },

         "NAVIGATION": {
-            "Home": "/",
-            "Usage": "/docs/usage",
-            "Reference": "/docs/api",
-            "Demos": "/docs/usage/showcase",
-            "Blog": "https://explosion.ai/blog"
+            "Usage": "/usage",
+            "Models": "/models",
+            "API": "/api"
         },

         "FOOTER": {
             "spaCy": {
-                "Usage": "/docs/usage",
-                "API Reference": "/docs/api",
-                "Tutorials": "/docs/usage/tutorials",
-                "Showcase": "/docs/usage/showcase"
+                "Usage": "/usage",
+                "Models": "/models",
+                "API Reference": "/api",
+                "Resources": "/usage/resources"
             },
             "Support": {
                 "Issue Tracker": "https://github.com/explosion/spaCy/issues",
                 "StackOverflow": "http://stackoverflow.com/questions/tagged/spacy",
-                "Reddit usergroup": "https://www.reddit.com/r/spacynlp/",
-                "Gitter chat": "https://gitter.im/explosion/spaCy"
+                "Reddit Usergroup": "https://www.reddit.com/r/spacynlp/",
+                "Gitter Chat": "https://gitter.im/explosion/spaCy"
             },
             "Connect": {
                 "Twitter": "https://twitter.com/spacy_io",
@@ -74,21 +69,11 @@
             {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" },
             {"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVDIA graphics card with CUDA 2+. See section below for more info."}]
         },
-        { "id": "model", "title": "Models", "multiple": true, "options": [
-            { "id": "en", "title": "English", "meta": "50MB" },
-            { "id": "de", "title": "German", "meta": "645MB" },
-            { "id": "fr", "title": "French", "meta": "1.33GB" },
-            { "id": "es", "title": "Spanish", "meta": "377MB"}]
-        }
+        { "id": "model", "title": "Models", "multiple": true }
     ],

     "QUICKSTART_MODELS": [
-        { "id": "lang", "title": "Language", "options": [
-            { "id": "en", "title": "English", "checked": true },
-            { "id": "de", "title": "German" },
-            { "id": "fr", "title": "French" },
-            { "id": "es", "title": "Spanish" }]
-        },
+        { "id": "lang", "title": "Language"},
         { "id": "load", "title": "Loading style", "options": [
             { "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." },
             { "id": "module", "title": "Import as module", "help": "Import the model explicitly as a Python module." }]
@@ -98,50 +83,15 @@
         }
     ],

-    "MODELS": {
-        "en": [
-            { "id": "en_core_web_sm", "lang": "English", "feats": [1, 1, 1, 1], "size": "50 MB", "license": "CC BY-SA", "def": true },
-            { "id": "en_core_web_md", "lang": "English", "feats": [1, 1, 1, 1], "size": "1 GB", "license": "CC BY-SA" },
-            { "id": "en_depent_web_md", "lang": "English", "feats": [1, 1, 1, 0], "size": "328 MB", "license": "CC BY-SA" },
-            { "id": "en_vectors_glove_md", "lang": "English", "feats": [1, 0, 0, 1], "size": "727 MB", "license": "CC BY-SA" }
-        ],
-        "de": [
-            { "id": "de_core_news_md", "lang": "German", "feats": [1, 1, 1, 1], "size": "645 MB", "license": "CC BY-SA" }
-        ],
-        "fr": [
-            { "id": "fr_depvec_web_lg", "lang": "French", "feats": [1, 1, 0, 1], "size": "1.33 GB", "license": "CC BY-NC" }
-        ],
-        "es": [
-            { "id": "es_core_web_md", "lang": "Spanish", "feats": [1, 1, 1, 1], "size": "377 MB", "license": "CC BY-SA"}
-        ]
-    },
-
-    "EXAMPLE_SENTENCES": {
-        "en": "This is a sentence.",
-        "de": "Dies ist ein Satz.",
-        "fr": "C'est une phrase.",
-        "es": "Esto es una frase."
-    },
-
     "ALPHA": true,
-    "V_CSS": "1.6",
-    "V_JS": "1.2",
+    "V_CSS": "2.0",
+    "V_JS": "2.0",
     "DEFAULT_SYNTAX": "python",
     "ANALYTICS": "UA-58931649-1",
     "MAILCHIMP": {
         "user": "spacy.us12",
         "id": "83b0498b1e7fa3c91ce68c3f1",
         "list": "89ad33e698"
-    },
-    "BADGES": {
-        "pipy": {
-            "badge": "https://img.shields.io/pypi/v/spacy.svg?style=flat-square",
-            "link": "https://pypi.python.org/pypi/spacy"
-        },
-        "conda": {
-            "badge": "https://anaconda.org/conda-forge/spacy/badges/version.svg",
-            "link": "https://anaconda.org/conda-forge/spacy"
-        }
     }
     }
 }
@@ -1,8 +1,6 @@
 //- 💫 INCLUDES > FOOTER

-include _mixins
-
-footer.o-footer.u-text.u-border-dotted
+footer.o-footer.u-text
     +grid.o-content
         each group, label in FOOTER
             +grid-col("quarter")
@@ -13,18 +11,18 @@ footer.o-footer.u-text.u-border-dotted
                     li
                         +a(url)=item

-    if SECTION != "docs"
+    if SECTION == "index"
         +grid-col("quarter")
             include _newsletter

-    if SECTION == "docs"
+    if SECTION != "index"
         .o-content.o-block.u-border-dotted
             include _newsletter

     .o-inline-list.u-text-center.u-text-tiny.u-color-subtle
         span © 2016-#{new Date().getFullYear()} #[+a(COMPANY_URL, true)=COMPANY]

-        +a(COMPANY_URL, true)
-            +svg("graphics", "explosion", 45).o-icon.u-color-theme.u-grayscale
+        +a(COMPANY_URL, true)(aria-label="Explosion AI")
+            +icon("explosion", 45).o-icon.u-color-theme.u-grayscale

         +a(COMPANY_URL + "/legal", true) Legal / Imprint
@@ -1,35 +1,71 @@
 //- 💫 INCLUDES > FUNCTIONS

-//- More descriptive variables for current.path and current.source
+//- Descriptive variables, available in the global scope

 - CURRENT = current.source
 - SECTION = current.path[0]
-- SUBSECTION = current.path[1]
+- LANGUAGES = public.models._data.LANGUAGES
+- MODELS = public.models._data.MODELS
+- CURRENT_MODELS = MODELS[current.source] || []
+
+- MODEL_COUNT = Object.keys(MODELS).map(m => Object.keys(MODELS[m]).length).reduce((a, b) => a + b)
+- MODEL_LANG_COUNT = Object.keys(MODELS).length
+- LANG_COUNT = Object.keys(LANGUAGES).length
+
+- MODEL_META = public.models._data.MODEL_META
+- MODEL_LICENSES = public.models._data.MODEL_LICENSES
+- MODEL_ACCURACY = public.models._data.MODEL_ACCURACY
+- EXAMPLE_SENTENCES = public.models._data.EXAMPLE_SENTENCES
+
+- IS_PAGE = (SECTION != "index") && !landing
+- IS_MODELS = (SECTION == "models" && LANGUAGES[current.source])
+- HAS_MODELS = IS_MODELS && CURRENT_MODELS.length


 //- Add prefixes to items of an array (for modifier CSS classes)
+    array - [array] list of class names or options, e.g. ["foot"]
+    prefix - [string] prefix to add to each class, e.g. "c-table__row"
+    RETURNS - [array] list of modified class names

 - function prefixArgs(array, prefix) {
--     return array.map(function(arg) {
--         return prefix + '--' + arg;
--     }).join(' ');
+-     return array.map(arg => prefix + '--' + arg).join(' ');
+- }
+
+//- Convert API paths (semi-temporary fix for renamed sections)
+    path - [string] link path supplied to +api mixin
+    RETURNS - [string] new link path to correct location
+
+- function convertAPIPath(path) {
+-     if (path.startsWith('spacy#') || path.startsWith('displacy#') || path.startsWith('util#')) {
+-         var comps = path.split('#');
+-         return "top-level#" + comps[0] + '.' + comps[1];
+-     }
+-     else if (path.startsWith('cli#')) {
+-         return "top-level#" + path.split('#')[1];
+-     }
+-     return path;
+- }
+
+
+//- Get model components from ID. Components can then be looked up in LANGUAGES
+    and MODEL_META respectively, to get their human-readable form.
+    id - [string] model ID, e.g. "en_core_web_sm"
+    RETURNS - [object] object keyed by components lang, type, genre and size
+
+- function getModelComponents(id) {
+-     var comps = id.split('_');
+-     return {'lang': comps[0], 'type': comps[1], 'genre': comps[2], 'size': comps[3]}
 - }


 //- Generate GitHub links
+    repo - [string] name of repo owned by explosion
+    filepath - [string] logical path to file relative to repository root
+    branch - [string] optional branch, defaults to "master"
+    RETURNS - [string] the correct link to the file on GitHub

 - function gh(repo, filepath, branch) {
 -     var branch = ALPHA ? 'develop' : branch
--     return 'https://github.com/' + SOCIAL.github + '/' + repo + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' );
+-     return 'https://github.com/' + SOCIAL.github + '/' + (repo || '') + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' );
-- }
-
-
-//- Get social images
-
-- function getSocialImg() {
--     var base = SITE_URL + '/assets/img/social/preview_'
--     var image = ALPHA ? 'alpha' : 'default'
--     if (preview) image = preview
--     else if (SECTION == 'docs' && !ALPHA) image = 'docs'
--     return base + image + '.jpg'
 - }
@ -1,5 +1,13 @@
|
||||||
//- 💫 MIXINS > BASE
|
//- 💫 MIXINS > BASE
|
||||||
|
|
||||||
|
//- Section
|
||||||
|
id - [string] anchor assigned to section (used for breadcrumb navigation)
|
||||||
|
|
||||||
|
mixin section(id)
|
||||||
|
section.o-section(id="section-" + id data-section=id)
|
||||||
|
block
|
||||||
|
|
||||||
|
|
||||||
//- Aside wrapper
|
//- Aside wrapper
|
||||||
label - [string] aside label
|
label - [string] aside label
|
||||||
|
|
||||||
|
@ -11,34 +19,26 @@ mixin aside-wrapper(label)
|
||||||
|
|
||||||
block
|
block
|
||||||
|
|
||||||
//- Date
|
|
||||||
input - [string] date in the format YYYY-MM-DD
|
|
||||||
|
|
||||||
mixin date(input)
|
//- SVG from map (uses embedded SVG sprite)
|
||||||
- var date = new Date(input)
|
|
||||||
- var months = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December' ]
|
|
||||||
|
|
||||||
time(datetime=JSON.parse(JSON.stringify(date)))&attributes(attributes)=months[date.getMonth()] + ' ' + date.getDate() + ', ' + date.getFullYear()
|
|
||||||
|
|
||||||
|
|
||||||
//- SVG from map
|
|
||||||
file - [string] SVG file name in /assets/img/
|
|
||||||
name - [string] SVG symbol id
|
name - [string] SVG symbol id
|
||||||
width - [integer] width in px
|
width - [integer] width in px
|
||||||
height - [integer] height in px (default: same as width)
|
height - [integer] height in px (default: same as width)
|
||||||
|
|
||||||
mixin svg(file, name, width, height)
|
mixin svg(name, width, height)
|
||||||
svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
|
svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
|
||||||
use(xlink:href="/assets/img/#{file}.svg##{name}")
|
use(xlink:href="#svg_#{name}")
|
||||||
|
|
||||||
|
|
||||||
//- Icon
|
//- Icon
|
||||||
name - [string] icon name, should be SVG symbol ID
|
name - [string] icon name (will be used as symbol id: #svg_{name})
|
||||||
size - [integer] icon width and height (default: 20)
|
width - [integer] icon width (default: 20)
|
||||||
|
height - [integer] icon height (defaults to width)
|
||||||
|
|
||||||
mixin icon(name, size)
|
mixin icon(name, width, height)
|
||||||
- var size = size || 20
|
- var width = width || 20
|
||||||
+svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes)
|
- var height = height || width
|
||||||
|
+svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes)
|
||||||
|
|
||||||
|
|
||||||
//- Pro/Con/Neutral icon
|
//- Pro/Con/Neutral icon
|
||||||
|
@ -46,8 +46,8 @@ mixin icon(name, size)
|
||||||
size - [integer] icon size (optional)
|
size - [integer] icon size (optional)
|
||||||
|
|
||||||
mixin procon(icon, size)
|
mixin procon(icon, size)
|
||||||
- colors = { pro: "green", con: "red", neutral: "yellow" }
|
- colors = { pro: "green", con: "red", neutral: "subtle" }
|
||||||
+icon(icon, size)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)
|
+icon("circle", size || 16)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)
|
||||||
|
|
||||||
|
|
||||||
//- Headlines Helper Mixin
|
//- Headlines Helper Mixin
|
||||||
|
@ -80,8 +80,7 @@ mixin headline(level)
|
||||||
|
|
||||||
mixin permalink(id)
|
mixin permalink(id)
|
||||||
if id
|
if id
|
||||||
a.u-permalink(id=id href="##{id}")
|
a.u-permalink(href="##{id}")
|
||||||
+icon("anchor").u-permalink__icon
|
|
||||||
block
|
block
|
||||||
|
|
||||||
else
|
else
|
||||||
|
@ -109,7 +108,7 @@ mixin quickstart(groups, headline, description, hide_results)
|
||||||
.c-quickstart__fields
|
.c-quickstart__fields
|
||||||
for option in group.options
|
for option in group.options
|
||||||
input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
|
input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
|
||||||
label.c-quickstart__label(for="qs-#{option.id}")!=option.title
|
label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title
|
||||||
if option.meta
|
if option.meta
|
||||||
| #[span.c-quickstart__label__meta (#{option.meta})]
|
| #[span.c-quickstart__label__meta (#{option.meta})]
|
||||||
if option.help
|
if option.help
|
||||||
|
@ -122,12 +121,10 @@ mixin quickstart(groups, headline, description, hide_results)
|
||||||
code.c-code-block__content.c-quickstart__code(data-qs-results="")
|
code.c-code-block__content.c-quickstart__code(data-qs-results="")
|
||||||
block
|
block
|
||||||
|
|
||||||
.c-quickstart__info.u-text-tiny.o-block.u-text-right
|
|
||||||
| Like this widget? Check out #[+a("https://github.com/ines/quickstart").u-link quickstart.js]!
|
|
||||||
|
|
||||||
|
|
||||||
//- Quickstart code item
|
//- Quickstart code item
|
||||||
data [object] - Rendering conditions (keyed by option group ID, value: option)
|
data - [object] Rendering conditions (keyed by option group ID, value: option)
|
||||||
|
style - [string] modifier ID for line style
|
||||||
|
|
||||||
mixin qs(data, style)
|
mixin qs(data, style)
|
||||||
- args = {}
|
- args = {}
|
||||||
|
@ -148,6 +145,13 @@ mixin terminal(label)
|
||||||
+code.x-terminal__code
|
+code.x-terminal__code
|
||||||
block
|
block
|
||||||
|
|
||||||
|
//- Chart.js
|
||||||
|
id - [string] chart ID, will be assigned as #chart_{id}
|
||||||
|
|
||||||
|
mixin chart(id)
|
||||||
|
figure.o-block&attributes(attributes)
|
||||||
|
canvas(id="chart_#{id}" width="800" height="400" style="max-width: 100%")
|
||||||
|
|
||||||
|
|
||||||
//- Gitter chat button and widget
|
//- Gitter chat button and widget
|
||||||
button - [string] text shown on button
|
button - [string] text shown on button
|
||||||
|
@ -156,26 +160,24 @@ mixin terminal(label)
|
||||||
mixin gitter(button, label)
|
mixin gitter(button, label)
|
||||||
aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
|
aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
|
||||||
|
|
||||||
button.js-gitter-button.c-chat__button.u-text-small
|
button.js-gitter-button.c-chat__button.u-text-tag
|
||||||
+icon("chat").o-icon--inline
|
+icon("chat", 16).o-icon--inline
|
||||||
!=button
|
!=button
|
||||||
|
|
||||||
|
|
||||||
//- Badge
|
//- Badge
|
||||||
name - [string] "pipy" or "conda"
|
image - [string] path to badge image
|
||||||
|
url - [string] badge link
|
||||||
|
|
||||||
mixin badge(name)
|
mixin badge(image, url)
|
||||||
- site = BADGES[name]
|
+a(url).u-padding-small.u-hide-link&attributes(attributes)
|
||||||
|
img.o-badge(src=image alt=url height="20")
|
||||||
if site
|
|
||||||
+a(site.link).u-padding-small
|
|
||||||
img(src=site.badge alt="{name} version" height="20")
|
|
||||||
|
|
||||||
|
|
||||||
//- Logo
|
//- spaCy logo
|
||||||
|
|
||||||
mixin logo()
|
mixin logo()
|
||||||
+svg("graphics", "spacy", 675, 215).o-logo&attributes(attributes)
|
+svg("spacy", 675, 215).o-logo&attributes(attributes)
|
||||||
|
|
||||||
|
|
||||||
//- Landing
|
//- Landing
|
||||||
|
@ -186,18 +188,56 @@ mixin landing-header()
|
||||||
.c-landing__content
|
.c-landing__content
|
||||||
block
|
block
|
||||||
|
|
||||||
|
mixin landing-banner(headline, label)
|
||||||
|
.c-landing__banner.u-padding.o-block.u-color-light
|
||||||
|
+grid.c-landing__banner__content.o-no-block
|
||||||
|
+grid-col("third")
|
||||||
|
h3.u-heading.u-heading-1
|
||||||
|
if label
|
||||||
|
div
|
||||||
|
span.u-text-label.u-text-label--light=label
|
||||||
|
!=headline
|
||||||
|
|
||||||
mixin landing-badge(url, graphic, alt, size)
|
+grid-col("two-thirds").c-landing__banner__text
|
||||||
+a(url)(aria-label=alt title=alt).c-landing__badge
|
block
|
||||||
+svg("graphics", graphic, size || 225)
|
|
||||||
|
|
||||||
|
mixin landing-logos(title, logos)
|
||||||
|
.o-content.u-text-center&attributes(attributes)
|
||||||
|
h3.u-heading.u-text-label.u-color-dark=title
|
||||||
|
|
||||||
|
each row, i in logos
|
||||||
|
- var is_last = i == logos.length - 1
|
||||||
|
+grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null)
|
||||||
|
each details, name in row
|
||||||
|
+a(details[0]).u-padding-medium
|
||||||
|
+icon(name, details[1], details[2])
|
||||||
|
|
||||||
|
if is_last
|
||||||
|
block
|
||||||
|
|
||||||
|
|
||||||
//- Under construction (temporary)
|
//- Under construction (temporary)
|
||||||
Marks sections that still need to be completed for the v2.0 release.
|
Marks sections that still need to be completed for the v2.0 release.
|
||||||
|
|
||||||
mixin under-construction()
|
mixin under-construction()
|
||||||
+infobox("🚧 Under construction")
|
+infobox("Under construction", "🚧")
|
||||||
| This section is still being written and will be updated for the v2.0
|
| This section is still being written and will be updated for the v2.0
|
||||||
| release. Is there anything that you think should definitely be mentioned or
|
| release. Is there anything that you think should definitely be mentioned or
|
||||||
| explained here? Any examples you'd like to see? #[strong Let us know]
|
| explained here? Any examples you'd like to see? #[strong Let us know]
|
||||||
| on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub!
|
| on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub!
|
||||||
|
|
||||||
|
|
||||||
|
//- Alpha infobox (temporary)
|
||||||
|
Added in the templates to notify user that they're visiting the alpha site.
|
||||||
|
|
||||||
|
mixin alpha-info()
|
||||||
|
+infobox("You are viewing the spaCy v2.0.0 alpha docs", "⚠️")
|
||||||
|
strong This page is part of the alpha documentation for spaCy v2.0.
|
||||||
|
| It does not reflect the state of the latest stable release.
|
||||||
|
| Because v2.0 is still under development, the implementation
|
||||||
|
| may differ from the intended state described here. See the
|
||||||
|
| #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
|
||||||
|
| for details on how to install and test the new version. To
|
||||||
|
| read the official docs for spaCy v1.x,
|
||||||
|
| #[+a("https://spacy.io/docs") go here].
|
||||||
|
|
|
@ -8,11 +8,15 @@ include _mixins-base
|
||||||
level - [integer] headline level, corresponds to h1, h2, h3 etc.
|
level - [integer] headline level, corresponds to h1, h2, h3 etc.
|
||||||
id - [string] unique identifier, creates permalink (optional)
|
id - [string] unique identifier, creates permalink (optional)
|
||||||
|
|
||||||
mixin h(level, id)
|
mixin h(level, id, source)
|
||||||
+headline(level).u-heading&attributes(attributes)
|
+headline(level).u-heading(id=id)&attributes(attributes)
|
||||||
+permalink(id)
|
+permalink(id)
|
||||||
block
|
block
|
||||||
|
|
||||||
|
if source
|
||||||
|
+button(gh("spacy", source), false, "secondary", "small").u-nowrap.u-float-right
|
||||||
|
span Source #[+icon("code", 14).o-icon--inline]
|
||||||
|
|
||||||
|
|
||||||
//- External links
|
//- External links
|
||||||
url - [string] link href
|
url - [string] link href
|
||||||
|
@ -38,21 +42,23 @@ mixin src(url)
|
||||||
|
|
||||||
|
|
||||||
//- API link (with added tag and automatically generated path)
|
//- API link (with added tag and automatically generated path)
|
||||||
path - [string] path to API docs page relative to /docs/api/
|
path - [string] path to API docs page relative to /api/
|
||||||
|
|
||||||
mixin api(path)
|
mixin api(path)
|
||||||
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap
|
- path = convertAPIPath(path)
|
||||||
|
+a("/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap
|
||||||
block
|
block
|
||||||
|
|
||||||
| #[+icon("book", 18).o-icon--inline.u-color-theme]
|
| #[+icon("book", 16).o-icon--inline.u-color-theme]
|
||||||
|
|
||||||
|
|
||||||
//- Help icon with tooltip
|
//- Help icon with tooltip
|
||||||
tooltip - [string] Tooltip text
|
tooltip - [string] Tooltip text
|
||||||
|
icon_size - [integer] Optional size of help icon in px.
|
||||||
|
|
||||||
mixin help(tooltip)
|
mixin help(tooltip, icon_size)
|
||||||
span(data-tooltip=tooltip)&attributes(attributes)
|
span(data-tooltip=tooltip)&attributes(attributes)
|
||||||
+icon("help", 16).i-icon--inline
|
+icon("help", icon_size || 16).o-icon--inline
|
||||||
|
|
||||||
|
|
||||||
//- Aside for text
|
//- Aside for text
|
||||||
|
@ -68,24 +74,43 @@ mixin aside(label)
|
||||||
label - [string] aside title (optional or false for no label)
|
label - [string] aside title (optional or false for no label)
|
||||||
language - [string] language for syntax highlighting (default: "python")
|
language - [string] language for syntax highlighting (default: "python")
|
||||||
supports basic relevant languages available for PrismJS
|
supports basic relevant languages available for PrismJS
|
||||||
|
prompt - [string] prompt displayed before first line, e.g. "$"
|
||||||
|
|
||||||
mixin aside-code(label, language)
|
mixin aside-code(label, language, prompt)
|
||||||
+aside-wrapper(label)
|
+aside-wrapper(label)
|
||||||
+code(false, language).o-no-block
|
+code(false, language, prompt).o-no-block
|
||||||
block
|
block
|
||||||
|
|
||||||
|
|
||||||
//- Infobox
|
//- Infobox
|
||||||
label - [string] infobox title (optional or false for no title)
|
label - [string] infobox title (optional or false for no title)
|
||||||
|
emoji - [string] optional emoji displayed before the title, necessary as
|
||||||
|
argument to be able to wrap it for spacing
|
||||||
|
|
||||||
mixin infobox(label)
|
mixin infobox(label, emoji)
|
||||||
aside.o-box.o-block.u-text-small
|
aside.o-box.o-block.u-text-small
|
||||||
if label
|
if label
|
||||||
h3.u-text-label.u-color-theme=label
|
h3.u-heading.u-text-label.u-color-theme
|
||||||
|
if emoji
|
||||||
|
span.o-emoji=emoji
|
||||||
|
| #{label}
|
||||||
|
|
||||||
block
|
block
|
||||||
|
|
||||||
|
|
||||||
|
//- Logos displayed in the top corner of some infoboxes
|
||||||
|
logos - [array] List of icon ID, width, height and link.
|
||||||
|
|
||||||
|
mixin infobox-logos(...logos)
|
||||||
|
.o-box__logos.u-text-right.u-float-right
|
||||||
|
for logo in logos
|
||||||
|
if logo[3]
|
||||||
|
| #[+a(logo[3]).u-inline-block.u-hide-link.u-padding-small #[+icon(logo[0], logo[1], logo[2]).u-color-dark]]
|
||||||
|
else
|
||||||
|
| #[+icon(logo[0], logo[1], logo[2]).u-color-dark]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//- Link button
|
//- Link button
|
||||||
url - [string] link href
|
url - [string] link href
|
||||||
trusted - [boolean] if not set / false, rel="noopener nofollow" is added
|
trusted - [boolean] if not set / false, rel="noopener nofollow" is added
|
||||||
|
@ -94,7 +119,7 @@ mixin infobox(label)
|
||||||
see assets/css/_components/_buttons.sass
|
see assets/css/_components/_buttons.sass
|
||||||
|
|
||||||
mixin button(url, trusted, ...style)
|
mixin button(url, trusted, ...style)
|
||||||
- external = url.includes("http")
|
- external = url && url.includes("http")
|
||||||
a.c-button.u-text-label(href=url class=prefixArgs(style, "c-button") role="button" target=external ? "_blank" : null rel=external && !trusted ? "noopener nofollow" : null)&attributes(attributes)
|
a.c-button.u-text-label(href=url class=prefixArgs(style, "c-button") role="button" target=external ? "_blank" : null rel=external && !trusted ? "noopener nofollow" : null)&attributes(attributes)
|
||||||
block
|
block
|
||||||
|
|
||||||
|
@ -103,31 +128,33 @@ mixin button(url, trusted, ...style)
|
||||||
label - [string] aside title (optional or false for no label)
|
label - [string] aside title (optional or false for no label)
|
||||||
language - [string] language for syntax highlighting (default: "python")
|
language - [string] language for syntax highlighting (default: "python")
|
||||||
supports basic relevant languages available for PrismJS
|
supports basic relevant languages available for PrismJS
|
||||||
prompt - [string] prompt or icon to display next to code block, (mostly used for old/new)
|
prompt - [string] prompt displayed before first line, e.g. "$"
|
||||||
height - [integer] optional height to clip code block to
|
height - [integer] optional height to clip code block to
|
||||||
|
icon - [string] icon displayed next to code block (e.g. "accept" for new code)
|
||||||
|
wrap - [boolean] wrap text and disable horizontal scrolling
|
||||||
|
|
||||||
mixin code(label, language, prompt, height)
|
mixin code(label, language, prompt, height, icon, wrap)
|
||||||
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
|
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
|
||||||
if label
|
if label
|
||||||
h4.u-text-label.u-text-label--dark=label
|
h4.u-text-label.u-text-label--dark=label
|
||||||
- var icon = (prompt == 'accept' || prompt == 'reject')
|
- var icon = icon || (prompt == 'accept' || prompt == 'reject')
|
||||||
if icon
|
if icon
|
||||||
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
|
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
|
||||||
.c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
|
.c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
|
||||||
+icon(icon, 18)
|
+icon(icon, 18)
|
||||||
|
|
||||||
code.c-code-block__content(data-prompt=icon ? null : prompt)
|
code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=icon ? null : prompt)
|
||||||
block
|
block
|
||||||
|
|
||||||
|
|
||||||
//- Code blocks to display old/new versions
|
//- Code blocks to display old/new versions
|
||||||
|
|
||||||
mixin code-old()
|
mixin code-old()
|
||||||
+code(false, false, "reject").o-block-small
|
+code(false, false, false, false, "reject").o-block-small
|
||||||
block
|
block
|
||||||
|
|
||||||
mixin code-new()
|
mixin code-new()
|
||||||
+code(false, false, "accept").o-block-small
|
+code(false, false, false, false, "accept").o-block-small
|
||||||
block
|
block
|
||||||
|
|
||||||
|
|
||||||
|
@ -138,12 +165,33 @@ mixin code-new()
|
||||||
|
|
||||||
mixin codepen(slug, height, default_tab)
|
mixin codepen(slug, height, default_tab)
|
||||||
figure.o-block(style="min-height: #{height}px")&attributes(attributes)
|
figure.o-block(style="min-height: #{height}px")&attributes(attributes)
|
||||||
.codepen(data-height=height data-theme-id="26467" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen)
|
.codepen(data-height=height data-theme-id="31335" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen)
|
||||||
+a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen
|
+a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen
|
||||||
|
|
||||||
script(async src="https://assets.codepen.io/assets/embed/ei.js")
|
script(async src="https://assets.codepen.io/assets/embed/ei.js")
|
||||||
|
|
||||||
|
|
||||||
|
//- GitHub embed
|
||||||
|
repo - [string] repository owned by explosion organization
|
||||||
|
file - [string] logical path to file, relative to repository root
|
||||||
|
alt_file - [string] alternative file path used in footer and link button
|
||||||
|
height - [integer] height of code preview in px
|
||||||
|
|
||||||
|
mixin github(repo, file, alt_file, height)
|
||||||
|
- var branch = ALPHA ? "develop" : "master"
|
||||||
|
- var height = height || 250
|
||||||
|
|
||||||
|
figure.o-block
|
||||||
|
pre.c-code-block.o-block-small(class="lang-#{(language || DEFAULT_SYNTAX)}" style="height: #{height}px; min-height: #{height}px")
|
||||||
|
code.c-code-block__content(data-gh-embed="#{repo}/#{branch}/#{file}")
|
||||||
|
|
||||||
|
footer.o-grid.u-text
|
||||||
|
.o-block-small.u-flex-full #[+icon("github")] #[code=repo + '/' + (alt_file || file)]
|
||||||
|
div
|
||||||
|
+button(gh(repo, alt_file || file), false, "primary", "small") View on GitHub
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//- Images / figures
|
//- Images / figures
|
||||||
url - [string] url or path to image
|
url - [string] url or path to image
|
||||||
width - [integer] image width in px, for better rendering (default: 500)
|
width - [integer] image width in px, for better rendering (default: 500)
|
||||||
|
@ -168,10 +216,26 @@ mixin image-caption()
|
||||||
block
|
block
|
||||||
|
|
||||||
|
|
||||||
//- Label
|
//- Graphic or illustration with button
|
||||||
|
original - [string] Path to original image
|
||||||
|
|
||||||
|
mixin graphic(original)
|
||||||
|
+image
|
||||||
|
block
|
||||||
|
if original
|
||||||
|
.u-text-right
|
||||||
|
+button(original, false, "secondary", "small") View large graphic
|
||||||
|
|
||||||
|
|
||||||
|
//- Labels
|
||||||
|
|
||||||
mixin label()
|
mixin label()
|
||||||
.u-text-label.u-color-subtle&attributes(attributes)
|
.u-text-label.u-color-dark&attributes(attributes)
|
||||||
|
block
|
||||||
|
|
||||||
|
|
||||||
|
mixin label-inline()
|
||||||
|
strong.u-text-label.u-color-dark&attributes(attributes)
|
||||||
block
|
block
|
||||||
|
|
||||||
|
|
||||||
|
@ -188,8 +252,10 @@ mixin tag()
|
||||||
mixin tag-model(...capabs)
|
mixin tag-model(...capabs)
|
||||||
- var intro = "To use this functionality, spaCy needs a model to be installed"
|
- var intro = "To use this functionality, spaCy needs a model to be installed"
|
||||||
- var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : ""
|
- var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : ""
|
||||||
+tag Requires model
|
|
||||||
+help(intro + ext + ".").u-color-theme
|
span.u-nowrap
|
||||||
|
+tag Needs model
|
||||||
|
+help(intro + ext + ".").u-color-theme
|
||||||
|
|
||||||
|
|
||||||
//- "New" tag to label features new in a specific version
|
//- "New" tag to label features new in a specific version
|
||||||
|
@ -219,15 +285,9 @@ mixin list(type, start)
|
||||||
|
|
||||||
//- List item (only used within +list)
|
//- List item (only used within +list)
|
||||||
|
|
||||||
mixin item(procon)
|
mixin item()
|
||||||
if procon
|
li.c-list__item&attributes(attributes)
|
||||||
li&attributes(attributes)
|
block
|
||||||
+procon(procon).c-list__icon
|
|
||||||
block
|
|
||||||
|
|
||||||
else
|
|
||||||
li.c-list__item&attributes(attributes)
|
|
||||||
block
|
|
||||||
|
|
||||||
|
|
||||||
//- Table
|
//- Table
|
||||||
|
@ -237,9 +297,9 @@ mixin table(head)
|
||||||
table.c-table.o-block&attributes(attributes)
|
table.c-table.o-block&attributes(attributes)
|
||||||
|
|
||||||
if head
|
if head
|
||||||
+row
|
+row("head")
|
||||||
each column in head
|
each column in head
|
||||||
th.c-table__head-cell.u-text-label=column
|
+head-cell=column
|
||||||
|
|
||||||
block
|
block
|
||||||
|
|
||||||
|
@ -251,10 +311,11 @@ mixin row(...style)
|
||||||
block
|
block
|
||||||
|
|
||||||
|
|
||||||
//- Footer table row (only ued within +table)
|
|
||||||
|
|
||||||
mixin footrow()
|
//- Header table cell (only used within +row)
|
||||||
tr.c-table__row.c-table__row--foot&attributes(attributes)
|
|
||||||
|
mixin head-cell()
|
||||||
|
th.c-table__head-cell.u-text-label&attributes(attributes)
|
||||||
block
|
block
|
||||||
|
|
||||||
|
|
||||||
|
@ -284,71 +345,58 @@ mixin grid-col(width)
|
||||||
|
|
||||||
|
|
||||||
//- Card (only used within +grid)
|
//- Card (only used within +grid)
|
||||||
title - [string] card title
|
title - [string] card title
|
||||||
details - [object] url, image, author, description, tags etc.
|
url - [string] link for card
|
||||||
(see /docs/usage/_data.json)
|
author - [string] optional author, displayed as byline at the bottom
|
||||||
|
icon - [string] optional ID of icon displayed with card
|
||||||
|
width - [string] optional width of grid column, defaults to "half"
|
||||||
|
|
||||||
mixin card(title, details)
|
mixin card(title, url, author, icon, width)
|
||||||
+grid-col("half").o-card.u-text&attributes(attributes)
|
+grid-col(width || "half").o-box.o-grid.o-grid--space.u-text&attributes(attributes)
|
||||||
if details.image
|
+a(url)
|
||||||
+a(details.url).o-block-small
|
h4.u-heading.u-text-label
|
||||||
img(src=details.image alt=title width="300" role="presentation")
|
if icon
|
||||||
|
+icon(icon, 25).u-float-right
|
||||||
if title
|
if title
|
||||||
+a(details.url)
|
span.u-color-dark=title
|
||||||
+h(3)=title
|
.o-block-small.u-text-small
|
||||||
|
block
|
||||||
if details.author
|
if author
|
||||||
.u-text-small.u-color-subtle by #{details.author}
|
.u-color-subtle.u-text-tiny by #{author}
|
||||||
|
|
||||||
if details.description || details.tags
|
|
||||||
ul
|
|
||||||
if details.description
|
|
||||||
li=details.description
|
|
||||||
|
|
||||||
if details.tags
|
|
||||||
li
|
|
||||||
each tag in details.tags
|
|
||||||
span.u-text-tag #{tag}
|
|
||||||
|
|
|
||||||
|
|
||||||
block
|
|
||||||
|
|
||||||
|
|
||||||
//- Simpler card list item (only used within +list)
|
//- Table of contents, to be used with +item mixins for links
|
||||||
title - [string] card title
|
col - [string] width of column (see +grid-col)
|
||||||
details - [object] url, image, author, description, tags etc.
|
|
||||||
(see /docs/usage/_data.json)
|
|
||||||
|
|
||||||
mixin card-item(title, details)
|
mixin table-of-contents(col)
|
||||||
+item&attributes(attributes)
|
+grid-col(col || "half")
|
||||||
+a(details.url)=title
|
+infobox
|
||||||
|
+label.o-block-small Table of contents
|
||||||
if details.description
|
+list("numbers").u-text-small.o-no-block
|
||||||
br
|
block
|
||||||
span=details.description
|
|
||||||
|
|
||||||
if details.author
|
|
||||||
br
|
|
||||||
span.u-text-small.u-color-subtle by #{details.author}
|
|
||||||
|
|
||||||
|
|
||||||
//- Table row for models table
|
//- Bibliography
|
||||||
|
id - [string] ID of bibliography component, for anchor links. Can be used if
|
||||||
|
there's more than one bibliography on one page.
|
||||||
|
|
||||||
mixin model-row(name, lang, procon, size, license, default_model, divider)
|
mixin bibliography(id)
|
||||||
- var licenses = { "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/", "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/" }
|
section(id=id || "bibliography")
|
||||||
|
+infobox
|
||||||
|
+label.o-block-small Bibliography
|
||||||
|
+list("numbers").u-text-small.o-no-block
|
||||||
|
block
|
||||||
|
|
||||||
+row(divider ? "divider": null)
|
|
||||||
+cell #[code=name]
|
//- Footnote
|
||||||
if default_model
|
id - [string / integer] ID of footnote.
|
||||||
| #[span.u-color-theme(title="default model") #[+icon("star", 16)]]
|
bib_id - [string] ID of bibliography component, defaults to "bibliography".
|
||||||
+cell=lang
|
tooltip - [string] optional text displayed as tooltip
|
||||||
each icon in procon
|
|
||||||
+cell.u-text-center #[+procon(icon ? "pro" : "con")]
|
mixin fn(id, bib_id, tooltip)
|
||||||
+cell.u-text-right=size
|
sup.u-padding-small(id="bib" + id data-tooltip=tooltip)
|
||||||
+cell
|
span.u-text-tag
|
||||||
if license in licenses
|
+a("#" + (bib_id || "bibliography")).u-hide-link #{id}
|
||||||
+a(licenses[license])=license
|
|
||||||
|
|
||||||
|
|
||||||
//- Table rows for annotation specs
|
//- Table rows for annotation specs
|
||||||
|
@ -383,14 +431,3 @@ mixin annotation-row(annots, style)
|
||||||
else
|
else
|
||||||
+cell=cell
|
+cell=cell
|
||||||
block
|
block
|
||||||
|
|
||||||
|
|
||||||
//- Table of contents, to be used with +item mixins for links
|
|
||||||
col - [string] width of column (see +grid-col)
|
|
||||||
|
|
||||||
mixin table-of-contents(col)
|
|
||||||
+grid-col(col || "half")
|
|
||||||
+infobox
|
|
||||||
+label.o-block-small Table of contents
|
|
||||||
+list("numbers").u-text-small.o-no-block
|
|
||||||
block
|
|
||||||
|
|
|
@ -1,19 +1,15 @@
|
||||||
//- 💫 INCLUDES > TOP NAVIGATION
|
//- 💫 INCLUDES > TOP NAVIGATION
|
||||||
|
|
||||||
include _mixins
|
|
||||||
|
|
||||||
nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null)
|
nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null)
|
||||||
a(href='/') #[+logo]
|
a(href="/" aria-label=SITENAME) #[+logo]
|
||||||
|
|
||||||
if SUBSECTION != "index"
|
|
||||||
.u-text-label.u-padding-small.u-hidden-xs=SUBSECTION
|
|
||||||
|
|
||||||
ul.c-nav__menu
|
ul.c-nav__menu
|
||||||
- var NAV = ALPHA ? { "Usage": "/docs/usage", "Reference": "/docs/api" } : NAVIGATION
|
- var current_url = '/' + current.path[0]
|
||||||
|
each url, item in NAVIGATION
|
||||||
each url, item in NAV
|
li.c-nav__menu__item(class=(current_url == url) ? "is-active" : null)
|
||||||
li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null)
|
|
||||||
+a(url)=item
|
+a(url)=item
|
||||||
|
|
||||||
li.c-nav__menu__item
|
li.c-nav__menu__item.u-hidden-xs
|
||||||
+a(gh("spaCy"))(aria-label="GitHub").u-hidden-xs #[+icon("github", 20)]
|
+a(gh("spaCy"))(aria-label="GitHub") #[+icon("github", 20)]
|
||||||
|
|
||||||
|
progress.c-progress.js-progress(value="0" max="1")
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
//- 💫 INCLUDES > NEWSLETTER
|
//- 💫 INCLUDES > NEWSLETTER
|
||||||
|
|
||||||
ul.o-block
|
ul.o-block-small
|
||||||
li.u-text-label.u-color-subtle Stay in the loop!
|
li.u-text-label.u-color-subtle Stay in the loop!
|
||||||
li Receive updates about new releases, tutorials and more.
|
li Receive updates about new releases, tutorials and more.
|
||||||
|
|
||||||
|
@ -10,7 +10,6 @@ form.o-grid#mc-embedded-subscribe-form(action="//#{MAILCHIMP.user}.list-manage.c
|
||||||
div(style="position: absolute; left: -5000px;" aria-hidden="true")
|
div(style="position: absolute; left: -5000px;" aria-hidden="true")
|
||||||
input(type="text" name="b_#{MAILCHIMP.id}_#{MAILCHIMP.list}" tabindex="-1" value="")
|
input(type="text" name="b_#{MAILCHIMP.id}_#{MAILCHIMP.list}" tabindex="-1" value="")
|
||||||
|
|
||||||
.o-grid-col.u-border.u-padding-small
|
.o-grid-col.o-grid.o-grid--nowrap.o-field.u-padding-small
|
||||||
input#mce-EMAIL.u-text(type="email" name="EMAIL" placeholder="Your email")
|
input#mce-EMAIL.o-field__input.u-text(type="email" name="EMAIL" placeholder="Your email" aria-label="Your email")
|
||||||
|
button#mc-embedded-subscribe.o-field__button.u-text-label.u-color-theme.u-nowrap(type="submit" name="subscribe") Sign up
|
||||||
button#mc-embedded-subscribe.u-text-label.u-color-theme(type="submit" name="subscribe") Sign up
|
|
||||||
|
|
|
@ -1,47 +1,56 @@
|
||||||
//- 💫 INCLUDES > DOCS PAGE TEMPLATE
|
//- 💫 INCLUDES > DOCS PAGE TEMPLATE
|
||||||
|
|
||||||
- sidebar_content = (SUBSECTION != "index") ? public.docs[SUBSECTION]._data.sidebar : public.docs._data.sidebar || FOOTER
|
- sidebar_content = (public[SECTION] ? public[SECTION]._data.sidebar : public._data[SECTION] ? public._data[SECTION].sidebar : false) || FOOTER
|
||||||
|
|
||||||
include _sidebar
|
include _sidebar
|
||||||
|
|
||||||
main.o-main.o-main--sidebar.o-main--aside
|
main.o-main.o-main--sidebar.o-main--aside
|
||||||
article.o-content
|
article.o-content
|
||||||
+grid.o-no-block
|
+grid.o-no-block
|
||||||
+grid-col(source ? "two-thirds" : "full")
|
+h(1).u-heading--title=title.replace("'", "’")
|
||||||
+h(1)=title
|
if tag
|
||||||
if tag
|
+tag=tag
|
||||||
+tag=tag
|
if tag_new
|
||||||
|
+tag-new(tag_new)
|
||||||
|
|
||||||
|
if teaser
|
||||||
|
.u-heading__teaser.u-text-small.u-color-dark=teaser
|
||||||
|
else if IS_MODELS
|
||||||
|
.u-heading__teaser.u-text-small.u-color-dark
|
||||||
|
| Available statistical models for
|
||||||
|
| #[code=current.source] (#{LANGUAGES[current.source]}).
|
||||||
|
|
||||||
if source
|
if source
|
||||||
+grid-col("third").u-text-right
|
.o-block.u-text-right
|
||||||
.o-inline-list
|
+button(gh("spacy", source), false, "secondary", "small").u-nowrap
|
||||||
+button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)]
|
| Source #[+icon("code", 14)]
|
||||||
|
|
||||||
|
//-if ALPHA
|
||||||
|
//- +alpha-info
|
||||||
|
|
||||||
if ALPHA
|
if IS_MODELS
|
||||||
+infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs")
|
include _page_models
|
||||||
strong This page is part of the alpha documentation for spaCy v2.0.
|
else
|
||||||
| It does not reflect the state of the latest stable release.
|
!=yield
|
||||||
| Because v2.0 is still under development, the implementation
|
|
||||||
| may differ from the intended state described here. See the
|
|
||||||
| #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
|
|
||||||
| for details on how to install and test the new version. To
|
|
||||||
| read the official docs for spaCy v1.x,
|
|
||||||
| #[+a("https://spacy.io/docs") go here].
|
|
||||||
|
|
||||||
!=yield
|
|
||||||
|
|
||||||
+grid.o-content.u-text
|
+grid.o-content.u-text
|
||||||
+grid-col("half")
|
+grid-col("half")
|
||||||
if next && public.docs[SUBSECTION]._data[next]
|
if !IS_MODELS
|
||||||
- data = public.docs[SUBSECTION]._data[next]
|
|
||||||
|
|
||||||
.o-inline-list
|
.o-inline-list
|
||||||
span #[strong.u-text-label Read next:] #[+a(next).u-link=data.title]
|
+button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary", "small")
|
||||||
|
| #[span.o-icon Suggest edits] #[+icon("code", 14)]
|
||||||
|
|
||||||
+grid-col("half").u-text-right
|
+grid-col("half").u-text-right
|
||||||
.o-inline-list
|
if next && public[SECTION]._data[next]
|
||||||
+button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)]
|
- data = public[SECTION]._data[next]
|
||||||
|
|
||||||
|
+grid("vcenter")
|
||||||
|
+a(next).u-text-small.u-flex-full
|
||||||
|
h4.u-text-label.u-color-dark Read next
|
||||||
|
| #{data.title}
|
||||||
|
|
||||||
|
+a(next).c-icon-button.c-icon-button--right(aria-hidden="true")
|
||||||
|
+icon("arrow-right", 24)
|
||||||
|
|
||||||
+gitter("spaCy chat")
|
+gitter("spaCy chat")
|
||||||
|
|
||||||
|
|
77
website/_includes/_page_models.jade
Normal file
|
@ -0,0 +1,77 @@
|
||||||
|
//- 💫 INCLUDES > MODELS PAGE TEMPLATE
|
||||||
|
|
||||||
|
for id in CURRENT_MODELS
|
||||||
|
+section(id)
|
||||||
|
+grid("vcenter").o-no-block(id=id)
|
||||||
|
+grid-col("two-thirds")
|
||||||
|
+h(2)
|
||||||
|
+a("#" + id).u-permalink=id
|
||||||
|
|
||||||
|
+grid-col("third").u-text-right
|
||||||
|
.u-color-subtle.u-text-tiny
|
||||||
|
+button(gh("spacy-models") + "/releases", true, "secondary", "small")(data-tpl=id data-tpl-key="download")
|
||||||
|
| Release details
|
||||||
|
.u-padding-small Latest: #[code(data-tpl=id data-tpl-key="version") n/a]
|
||||||
|
|
||||||
|
+aside-code("Installation", "bash", "$").
|
||||||
|
spacy download #{id}
|
||||||
|
|
||||||
|
- var comps = getModelComponents(id)
|
||||||
|
|
||||||
|
p(data-tpl=id data-tpl-key="description")
|
||||||
|
|
||||||
|
div(data-tpl=id data-tpl-key="error" style="display: none")
|
||||||
|
+infobox
|
||||||
|
| Unable to load model details from GitHub. To find out more
|
||||||
|
| about this model, see the overview of the
|
||||||
|
| #[+a(gh("spacy-models") + "/releases") latest model releases].
|
||||||
|
|
||||||
|
+table(data-tpl=id data-tpl-key="table")
|
||||||
|
+row
|
||||||
|
+cell #[+label Language]
|
||||||
|
+cell #[+tag=comps.lang] #{LANGUAGES[comps.lang]}
|
||||||
|
for comp, label in {"Type": comps.type, "Genre": comps.genre}
|
||||||
|
+row
|
||||||
|
+cell #[+label=label]
|
||||||
|
+cell #[+tag=comp] #{MODEL_META[comp]}
|
||||||
|
+row
|
||||||
|
+cell #[+label Size]
|
||||||
|
+cell #[+tag=comps.size] #[span(data-tpl=id data-tpl-key="size") #[em n/a]]
|
||||||
|
|
||||||
|
each label in ["Pipeline", "Sources", "Author", "License"]
|
||||||
|
- var field = label.toLowerCase()
|
||||||
|
+row
|
||||||
|
+cell.u-nowrap
|
||||||
|
+label=label
|
||||||
|
if MODEL_META[field]
|
||||||
|
| #[+help(MODEL_META[field]).u-color-subtle]
|
||||||
|
+cell
|
||||||
|
span(data-tpl=id data-tpl-key=field) #[em n/a]
|
||||||
|
|
||||||
|
+row(data-tpl=id data-tpl-key="compat-wrapper" style="display: none")
|
||||||
|
+cell
|
||||||
|
+label Compat #[+help("Latest compatible model version for your spaCy installation").u-color-subtle]
|
||||||
|
+cell
|
||||||
|
.o-field.u-float-left
|
||||||
|
select.o-field__select.u-text-small(data-tpl=id data-tpl-key="compat")
|
||||||
|
.o-empty(data-tpl=id data-tpl-key="compat-versions")
|
||||||
|
|
||||||
|
section(data-tpl=id data-tpl-key="accuracy-wrapper" style="display: none")
|
||||||
|
+grid.o-no-block
|
||||||
|
+grid-col("third")
|
||||||
|
+h(4) Accuracy
|
||||||
|
+table.o-block-small
|
||||||
|
for label, field in MODEL_ACCURACY
|
||||||
|
+row(style="display: none")
|
||||||
|
+cell.u-nowrap
|
||||||
|
+label=label
|
||||||
|
if MODEL_META[field]
|
||||||
|
| #[+help(MODEL_META[field]).u-color-subtle]
|
||||||
|
+cell.u-text-right(data-tpl=id data-tpl-key=field)
|
||||||
|
| n/a
|
||||||
|
|
||||||
|
+grid-col("two-thirds")
|
||||||
|
+h(4) Comparison
|
||||||
|
+chart(id).u-padding-small
|
||||||
|
|
||||||
|
p.u-text-small.u-color-dark(data-tpl=id data-tpl-key="notes")
|
|
@ -1,27 +1,46 @@
|
||||||
//- 💫 INCLUDES > SCRIPTS
|
//- 💫 INCLUDES > SCRIPTS
|
||||||
|
|
||||||
script(src="/assets/js/main.js?v#{V_JS}")
|
if quickstart
|
||||||
script(src="/assets/js/prism.js")
|
script(src="/assets/js/quickstart.min.js")
|
||||||
|
|
||||||
if SECTION == "docs"
|
if IS_PAGE
|
||||||
if quickstart
|
script(src="/assets/js/in-view.min.js")
|
||||||
script(src="/assets/js/quickstart.js")
|
|
||||||
script var qs = new Quickstart("#qs")
|
|
||||||
|
|
||||||
script.
|
if HAS_MODELS
|
||||||
((window.gitter = {}).chat = {}).options = {
|
script(src="/assets/js/chart.min.js")
|
||||||
useStyles: false,
|
|
||||||
activationElement: '.js-gitter-button',
|
|
||||||
targetElement: '.js-gitter',
|
|
||||||
room: '!{SOCIAL.gitter}'
|
|
||||||
};
|
|
||||||
|
|
||||||
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
|
|
||||||
|
|
||||||
if environment == "deploy"
|
if environment == "deploy"
|
||||||
script
|
script(async src="https://www.google-analytics.com/analytics.js")
|
||||||
|
|
||||||
|
script(src="/assets/js/prism.min.js")
|
||||||
|
script(src="/assets/js/main.js?v#{V_JS}")
|
||||||
|
|
||||||
|
script
|
||||||
|
| new ProgressBar('.js-progress');
|
||||||
|
|
||||||
|
if changelog
|
||||||
|
| new Changelog('!{SOCIAL.github}', 'spacy');
|
||||||
|
|
||||||
|
if quickstart
|
||||||
|
| new Quickstart("#qs");
|
||||||
|
|
||||||
|
if IS_PAGE
|
||||||
|
| new SectionHighlighter('data-section', 'data-nav');
|
||||||
|
| new GitHubEmbed('!{SOCIAL.github}', 'data-gh-embed');
|
||||||
|
| ((window.gitter = {}).chat = {}).options = {
|
||||||
|
| useStyles: false,
|
||||||
|
| activationElement: '.js-gitter-button',
|
||||||
|
| targetElement: '.js-gitter',
|
||||||
|
| room: '!{SOCIAL.gitter}'
|
||||||
|
| };
|
||||||
|
|
||||||
|
if HAS_MODELS
|
||||||
|
| new ModelLoader('!{MODELS_REPO}', !{JSON.stringify(CURRENT_MODELS)}, !{JSON.stringify(MODEL_LICENSES)}, !{JSON.stringify(MODEL_ACCURACY)});
|
||||||
|
|
||||||
|
if environment == "deploy"
|
||||||
| window.ga=window.ga||function(){
|
| window.ga=window.ga||function(){
|
||||||
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
|
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
|
||||||
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
|
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
|
||||||
|
|
||||||
script(async src="https://www.google-analytics.com/analytics.js")
|
if IS_PAGE
|
||||||
|
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
|
||||||
|
|
|
@ -1,13 +1,23 @@
|
||||||
//- 💫 INCLUDES > SIDEBAR
|
//- 💫 INCLUDES > SIDEBAR
|
||||||
|
|
||||||
include _mixins
|
|
||||||
|
|
||||||
menu.c-sidebar.js-sidebar.u-text
|
menu.c-sidebar.js-sidebar.u-text
|
||||||
if sidebar_content
|
if sidebar_content
|
||||||
each items, menu in sidebar_content
|
each items, sectiontitle in sidebar_content
|
||||||
ul.c-sidebar__section.o-block
|
ul.c-sidebar__section.o-block-small
|
||||||
li.u-text-label.u-color-subtle=menu
|
li.u-text-label.u-color-dark=sectiontitle
|
||||||
|
|
||||||
each url, item in items
|
each url, item in items
|
||||||
li(class=(CURRENT == url || (CURRENT == "index" && url == "./")) ? "is-active" : null)
|
- var is_current = CURRENT == url || (CURRENT == "index" && url == "./")
|
||||||
+a(url)=item
|
li.c-sidebar__item
|
||||||
|
+a(url)(class=is_current ? "is-active" : null)=item
|
||||||
|
|
||||||
|
if is_current
|
||||||
|
if IS_MODELS && CURRENT_MODELS.length
|
||||||
|
- menu = Object.assign({}, ...CURRENT_MODELS.map(id => ({ [id]: id })))
|
||||||
|
if menu
|
||||||
|
ul.c-sidebar__crumb.u-hidden-sm
|
||||||
|
- var counter = 0
|
||||||
|
for id, title in menu
|
||||||
|
- counter++
|
||||||
|
li.c-sidebar__crumb__item(data-nav=id class=(counter == 1) ? "is-active" : null)
|
||||||
|
+a("#section-" + id)=title
|
||||||
|
|
157
website/_includes/_svg.jade
Normal file
File diff suppressed because one or more lines are too long
|
@ -2,11 +2,16 @@
|
||||||
|
|
||||||
include _includes/_mixins
|
include _includes/_mixins
|
||||||
|
|
||||||
|
- title = IS_MODELS ? LANGUAGES[current.source] || title : title
|
||||||
|
- social_title = (SECTION == "index") ? SITENAME + " - " + SLOGAN : title + " - " + SITENAME
|
||||||
|
- social_img = SITE_URL + "/assets/img/social/preview_" + (preview || ALPHA ? "alpha" : "default") + ".jpg"
|
||||||
|
|
||||||
doctype html
|
doctype html
|
||||||
html(lang="en")
|
html(lang="en")
|
||||||
title
|
title
|
||||||
if SECTION == "docs" && SUBSECTION && SUBSECTION != "index"
|
if SECTION == "api" || SECTION == "usage" || SECTION == "models"
|
||||||
| #{title} | #{SITENAME} #{SUBSECTION == "api" ? "API" : "Usage"} Documentation
|
- var title_section = (SECTION == "api") ? "API" : SECTION.charAt(0).toUpperCase() + SECTION.slice(1)
|
||||||
|
| #{title} | #{SITENAME} #{title_section} Documentation
|
||||||
|
|
||||||
else if SECTION != "index"
|
else if SECTION != "index"
|
||||||
| #{title} | #{SITENAME}
|
| #{title} | #{SITENAME}
|
||||||
|
@ -22,32 +27,30 @@ html(lang="en")
|
||||||
meta(property="og:type" content="website")
|
meta(property="og:type" content="website")
|
||||||
meta(property="og:site_name" content=sitename)
|
meta(property="og:site_name" content=sitename)
|
||||||
meta(property="og:url" content="#{SITE_URL}/#{current.path.join('/')}")
|
meta(property="og:url" content="#{SITE_URL}/#{current.path.join('/')}")
|
||||||
meta(property="og:title" content="#{title} - spaCy")
|
meta(property="og:title" content=social_title)
|
||||||
meta(property="og:description" content=description)
|
meta(property="og:description" content=description)
|
||||||
meta(property="og:image" content=getSocialImg())
|
meta(property="og:image" content=social_img)
|
||||||
|
|
||||||
meta(name="twitter:card" content="summary_large_image")
|
meta(name="twitter:card" content="summary_large_image")
|
||||||
meta(name="twitter:site" content="@" + SOCIAL.twitter)
|
meta(name="twitter:site" content="@" + SOCIAL.twitter)
|
||||||
meta(name="twitter:title" content="#{title} - spaCy")
|
meta(name="twitter:title" content=social_title)
|
||||||
meta(name="twitter:description" content=description)
|
meta(name="twitter:description" content=description)
|
||||||
meta(name="twitter:image" content=getSocialImg())
|
meta(name="twitter:image" content=social_img)
|
||||||
|
|
||||||
link(rel="shortcut icon" href="/assets/img/favicon.ico")
|
link(rel="shortcut icon" href="/assets/img/favicon.ico")
|
||||||
link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico")
|
link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico")
|
||||||
|
|
||||||
if ALPHA && SECTION == "docs"
|
if SECTION == "api"
|
||||||
link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet")
|
link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet")
|
||||||
|
|
||||||
else if SUBSECTION == "usage"
|
|
||||||
link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet")
|
|
||||||
|
|
||||||
else
|
else
|
||||||
link(href="/assets/css/style.css?v#{V_CSS}" rel="stylesheet")
|
link(href="/assets/css/style.css?v#{V_CSS}" rel="stylesheet")
|
||||||
|
|
||||||
body
|
body
|
||||||
|
include _includes/_svg
|
||||||
include _includes/_navigation
|
include _includes/_navigation
|
||||||
|
|
||||||
if SECTION == "docs"
|
if !landing
|
||||||
include _includes/_page-docs
|
include _includes/_page-docs
|
||||||
|
|
||||||
else
|
else
|
||||||
|
|
43
website/api/_annotation/_biluo.jade
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
//- 💫 DOCS > API > ANNOTATION > BILUO
|
||||||
|
|
||||||
|
+table([ "Tag", "Description" ])
|
||||||
|
+row
|
||||||
|
+cell #[code #[span.u-color-theme B] EGIN]
|
||||||
|
+cell The first token of a multi-token entity.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code #[span.u-color-theme I] N]
|
||||||
|
+cell An inner token of a multi-token entity.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code #[span.u-color-theme L] AST]
|
||||||
|
+cell The final token of a multi-token entity.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code #[span.u-color-theme U] NIT]
|
||||||
|
+cell A single-token entity.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code #[span.u-color-theme O] UT]
|
||||||
|
+cell A non-entity token.
|
||||||
|
|
||||||
|
+aside("Why BILUO, not IOB?")
|
||||||
|
| There are several coding schemes for encoding entity annotations as
|
||||||
|
| token tags. These coding schemes are equally expressive, but not
|
||||||
|
| necessarily equally learnable.
|
||||||
|
| #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth]
|
||||||
|
| showed that the minimal #[strong Begin], #[strong In], #[strong Out]
|
||||||
|
| scheme was more difficult to learn than the #[strong BILUO] scheme that
|
||||||
|
| we use, which explicitly marks boundary tokens.
|
||||||
|
|
||||||
|
p
|
||||||
|
| spaCy translates the character offsets into this scheme, in order to
|
||||||
|
| decide the cost of each action given the current state of the entity
|
||||||
|
| recogniser. The costs are then used to calculate the gradient of the
|
||||||
|
| loss, to train the model. The exact algorithm is a pastiche of
|
||||||
|
| well-known methods, and is not currently described in any single
|
||||||
|
| publication. The model is a greedy transition-based parser guided by a
|
||||||
|
| linear model whose weights are learned using the averaged perceptron
|
||||||
|
| loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle]
|
||||||
|
| imitation learning strategy. The transition system is equivalent to the
|
||||||
|
| BILOU tagging scheme.
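
As a concrete illustration of the scheme described above, the short sketch below assigns BILUO tags to a hand-made token list. It is a standalone example written for this page, not spaCy's implementation, and the sentence, character offsets and entity spans are invented for the demonstration.

    # Standalone illustration of the BILUO scheme (not spaCy's internal code).
    # Tokens are (text, start_char) pairs; entities are (start, end, label) character spans.
    tokens = [("Apple", 0), ("Inc.", 6), ("is", 11), ("based", 14), ("in", 20), ("Cupertino", 23)]
    entities = [(0, 10, "ORG"), (23, 32, "GPE")]

    def biluo_tags(tokens, entities):
        tags = ["O"] * len(tokens)                 # default: outside any entity
        for start, end, label in entities:
            inside = [i for i, (text, begin) in enumerate(tokens)
                      if begin >= start and begin + len(text) <= end]
            if len(inside) == 1:
                tags[inside[0]] = "U-" + label     # single-token entity
            elif inside:
                tags[inside[0]] = "B-" + label     # first token of the entity
                tags[inside[-1]] = "L-" + label    # last token of the entity
                for i in inside[1:-1]:
                    tags[i] = "I-" + label         # inner tokens
        return tags

    print(biluo_tags(tokens, entities))
    # ['B-ORG', 'L-ORG', 'O', 'O', 'O', 'U-GPE']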
|
115
website/api/_architecture/_cython.jade
Normal file
|
@ -0,0 +1,115 @@
|
||||||
|
//- 💫 DOCS > API > ARCHITECTURE > CYTHON
|
||||||
|
|
||||||
|
+aside("What's Cython?")
|
||||||
|
| #[+a("http://cython.org/") Cython] is a language for writing
|
||||||
|
| C extensions for Python. Most Python code is also valid Cython, but
|
||||||
|
| you can add type declarations to get efficient memory-managed code
|
||||||
|
| just like C or C++.
|
||||||
|
|
||||||
|
p
|
||||||
|
| spaCy's core data structures are implemented as
|
||||||
|
| #[+a("http://cython.org/") Cython] #[code cdef] classes. Memory is
|
||||||
|
| managed through the #[+a(gh("cymem")) #[code cymem]]
|
||||||
|
| #[code cymem.Pool] class, which allows you
|
||||||
|
| to allocate memory which will be freed when the #[code Pool] object
|
||||||
|
| is garbage collected. This means you usually don't have to worry
|
||||||
|
| about freeing memory. You just have to decide which Python object
|
||||||
|
| owns the memory, and make it own the #[code Pool]. When that object
|
||||||
|
| goes out of scope, the memory will be freed. You do have to take
|
||||||
|
| care that no pointers outlive the object that owns them — but this
|
||||||
|
| is generally quite easy.
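
A minimal Cython sketch of this ownership pattern follows. The Counts class and its fields are invented for the example; the calls used, Pool() and Pool.alloc(number, size), are cymem's documented interface.

    # Sketch only: an example cdef class that owns its memory via a Pool.
    from cymem.cymem cimport Pool

    cdef class Counts:
        cdef Pool mem
        cdef int* counts

        def __init__(self, int size):
            # The Counts object owns the Pool, so the allocation below lives
            # exactly as long as the Counts object and is freed with it.
            self.mem = Pool()
            self.counts = <int*>self.mem.alloc(size, sizeof(int))

        def increment(self, int i):
            self.counts[i] += 1

Because the Pool is an attribute of the owning object, no __dealloc__ method is needed; the only rule is that the counts pointer must not be used after the Counts instance is gone.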
|
||||||
|
|
||||||
|
p
|
||||||
|
| All Cython modules should have the #[code # cython: infer_types=True]
|
||||||
|
| compiler directive at the top of the file. This makes the code much
|
||||||
|
| cleaner, as it avoids the need for many type declarations. If
|
||||||
|
| possible, you should prefer to declare your functions #[code nogil],
|
||||||
|
| even if you don't especially care about multi-threading. The reason
|
||||||
|
| is that #[code nogil] functions help the Cython compiler reason about
|
||||||
|
| your code quite a lot — you're telling the compiler that no Python
|
||||||
|
| dynamics are possible. This lets many errors be raised, and ensures
|
||||||
|
| your function will run at C speed.
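
For instance, a small helper written in this style might look like the sketch below. The function itself is invented for illustration, but it shows where the directive goes and a nogil signature that touches no Python objects.

    # cython: infer_types=True
    # Illustrative only: any function that uses no Python objects can be nogil.

    cdef float clipped_ratio(float num, float denom) nogil:
        if denom == 0:
            return 0.0
        ratio = num / denom        # C float, inferred thanks to infer_types=True
        if ratio > 1.0:
            return 1.0
        return ratio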
|
||||||
|
|
||||||
|
|
||||||
|
p
|
||||||
|
| Cython gives you many choices of sequences: you could have a Python
|
||||||
|
| list, a numpy array, a memory view, a C++ vector, or a pointer.
|
||||||
|
| Pointers are preferred, because they are fastest, have the most
|
||||||
|
| explicit semantics, and let the compiler check your code more
|
||||||
|
| strictly. C++ vectors are also great — but you should only use them
|
||||||
|
| internally in functions. It's less friendly to accept a vector as an
|
||||||
|
| argument, because that asks the user to do much more work. Here's
|
||||||
|
| how to get a pointer from a numpy array, memory view or vector:
|
||||||
|
|
||||||
|
+code.
|
||||||
|
cdef void get_pointers(np.ndarray[int, mode='c'] numpy_array, vector[int] cpp_vector, int[::1] memory_view) nogil:
|
||||||
|
pointer1 = <int*>numpy_array.data
|
||||||
|
pointer2 = cpp_vector.data()
|
||||||
|
pointer3 = &memory_view[0]
|
||||||
|
|
||||||
|
p
|
||||||
|
| Both C arrays and C++ vectors reassure the compiler that no Python
|
||||||
|
| operations are possible on your variable. This is a big advantage:
|
||||||
|
| it lets the Cython compiler raise many more errors for you.
|
||||||
|
|
||||||
|
p
|
||||||
|
| When getting a pointer from a numpy array or memoryview, take care
|
||||||
|
| that the data is actually stored in C-contiguous order — otherwise
|
||||||
|
| you'll get a pointer to nonsense. The type-declarations in the code
|
||||||
|
| above should generate runtime errors if buffers with incorrect
|
||||||
|
| memory layouts are passed in. To iterate over the array, the
|
||||||
|
| following style is preferred:
|
||||||
|
|
||||||
|
+code.
|
||||||
|
cdef int c_total(const int* int_array, int length) nogil:
|
||||||
|
total = 0
|
||||||
|
for item in int_array[:length]:
|
||||||
|
total += item
|
||||||
|
return total
|
||||||
|
|
||||||
|
p
|
||||||
|
| If this is confusing, consider that the compiler couldn't deal with
|
||||||
|
| #[code for item in int_array:] — there's no length attached to a raw
|
||||||
|
| pointer, so how could we figure out where to stop? The length is
|
||||||
|
| provided in the slice notation as a solution to this. Note that we
|
||||||
|
| don't have to declare the type of #[code item] in the code above —
|
||||||
|
| the compiler can easily infer it. This gives us tidy code that looks
|
||||||
|
| quite like Python, but is exactly as fast as C — because we've made
|
||||||
|
| sure the compilation to C is trivial.
|
||||||
|
|
||||||
|
p
|
||||||
|
| Your functions cannot be declared #[code nogil] if they need to
|
||||||
|
| create Python objects or call Python functions. This is perfectly
|
||||||
|
| okay — you shouldn't torture your code just to get #[code nogil]
|
||||||
|
| functions. However, if your function isn't #[code nogil], you should
|
||||||
|
| compile your module with #[code cython -a --cplus my_module.pyx] and
|
||||||
|
| open the resulting #[code my_module.html] file in a browser. This
|
||||||
|
| will let you see how Cython is compiling your code. Calls into the
|
||||||
|
| Python run-time will be in bright yellow. This lets you easily see
|
||||||
|
| whether Cython is able to correctly type your code, or whether there
|
||||||
|
| are unexpected problems.
|
||||||
|
|
||||||
|
p
|
||||||
|
| Working in Cython is very rewarding once you're over the initial
|
||||||
|
| learning curve. As with C and C++, the first way you write something
|
||||||
|
| in Cython will often be the performance-optimal approach. In
|
||||||
|
| contrast, Python optimisation generally requires a lot of
|
||||||
|
| experimentation. Is it faster to have an #[code if item in my_dict]
|
||||||
|
| check, or to use #[code .get()]? What about
|
||||||
|
| #[code try]/#[code except]? Does this numpy operation create a copy?
|
||||||
|
| There's no way to guess the answers to these questions, and you'll
|
||||||
|
| usually be dissatisfied with your results — so there's no way to
|
||||||
|
| know when to stop this process. In the worst case, you'll make a
|
||||||
|
| mess that invites the next reader to try their luck too. This is
|
||||||
|
| like one of those
|
||||||
|
| #[+a("http://www.wemjournal.org/article/S1080-6032%2809%2970088-2/abstract") volcanic gas-traps],
|
||||||
|
| where the rescuers keep passing out from low oxygen, causing
|
||||||
|
| another rescuer to follow — only to succumb themselves. In short,
|
||||||
|
| just say no to optimizing your Python. If it's not fast enough the
|
||||||
|
| first time, just switch to Cython.
|
||||||
|
|
||||||
|
+infobox("Resources")
|
||||||
|
+list.o-no-block
|
||||||
|
+item #[+a("http://docs.cython.org/en/latest/") Official Cython documentation] (cython.org)
|
||||||
|
+item #[+a("https://explosion.ai/blog/writing-c-in-cython", true) Writing C in Cython] (explosion.ai)
|
||||||
|
+item #[+a("https://explosion.ai/blog/multithreading-with-cython") Multi-threading spaCy’s parser and named entity recogniser] (explosion.ai)
|
141
website/api/_architecture/_nn-model.jade
Normal file
|
@ -0,0 +1,141 @@
|
||||||
|
//- 💫 DOCS > API > ARCHITECTURE > NN MODEL ARCHITECTURE
|
||||||
|
|
||||||
|
p
|
||||||
|
| The parsing model is a blend of recent results. The two recent
|
||||||
|
| inspirations have been the work of Eliyahu Kiperwasser and Yoav Goldberg at
|
||||||
|
| Bar Ilan#[+fn(1)], and the SyntaxNet team from Google. The foundation of
|
||||||
|
| the parser is still based on the work of Joakim Nivre#[+fn(2)], who
|
||||||
|
| introduced the transition-based framework#[+fn(3)], the arc-eager
|
||||||
|
| transition system, and the imitation learning objective. The model is
|
||||||
|
| implemented using #[+a(gh("thinc")) Thinc], spaCy's machine learning
|
||||||
|
| library. We first predict context-sensitive vectors for each word in the
|
||||||
|
| input:
|
||||||
|
|
||||||
|
+code.
|
||||||
|
(embed_lower | embed_prefix | embed_suffix | embed_shape)
|
||||||
|
>> Maxout(token_width)
|
||||||
|
>> convolution ** 4
|
||||||
|
|
||||||
|
p
|
||||||
|
| This convolutional layer is shared between the tagger, parser and NER,
|
||||||
|
| and will also be shared by the future neural lemmatizer. Because the
|
||||||
|
| parser shares these layers with the tagger, the parser does not require
|
||||||
|
| tag features. I got this trick from David Weiss's "Stack-propagation"
|
||||||
|
| paper#[+fn(4)].
|
||||||
|
|
||||||
|
p
|
||||||
|
| To boost the representation, the tagger actually predicts a "super tag"
|
||||||
|
| with POS, morphology and dependency label#[+fn(5)]. The tagger predicts
|
||||||
|
| these supertags by adding a softmax layer onto the convolutional layer –
|
||||||
|
| so, we're teaching the convolutional layer to give us a representation
|
||||||
|
| that's one affine transform from this informative lexical information.
|
||||||
|
| This is obviously good for the parser (which backprops to the
|
||||||
|
| convolutions too). The parser model makes a state vector by concatenating
|
||||||
|
| the vector representations for its context tokens. The current context
|
||||||
|
| tokens:
|
||||||
|
|
||||||
|
+table
|
||||||
|
+row
|
||||||
|
+cell #[code S0], #[code S1], #[code S2]
|
||||||
|
+cell Top three words on the stack.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code B0], #[code B1]
|
||||||
|
+cell First two words of the buffer.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell.u-nowrap
|
||||||
|
| #[code S0L1], #[code S1L1], #[code S2L1], #[code B0L1],
|
||||||
|
| #[code B1L1]#[br]
|
||||||
|
| #[code S0L2], #[code S1L2], #[code S2L2], #[code B0L2],
|
||||||
|
| #[code B1L2]
|
||||||
|
+cell
|
||||||
|
| Leftmost and second leftmost children of #[code S0], #[code S1],
|
||||||
|
| #[code S2], #[code B0] and #[code B1].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell.u-nowrap
|
||||||
|
| #[code S0R1], #[code S1R1], #[code S2R1], #[code B0R1],
|
||||||
|
| #[code B1R1]#[br]
|
||||||
|
| #[code S0R2], #[code S1R2], #[code S2R2], #[code B0R2],
|
||||||
|
| #[code B1R2]
|
||||||
|
+cell
|
||||||
|
| Rightmost and second rightmost children of #[code S0], #[code S1],
|
||||||
|
| #[code S2], #[code B0] and #[code B1].
|
||||||
|
|
||||||
|
p
|
||||||
|
| This makes the state vector quite long: #[code 13*T], where #[code T] is
|
||||||
|
| the token vector width (128 is working well). Fortunately, there's a way
|
||||||
|
| to structure the computation to save some expense (and make it more
|
||||||
|
| GPU-friendly).
|
||||||
|
|
||||||
|
p
|
||||||
|
| The parser typically visits #[code 2*N] states for a sentence of length
|
||||||
|
| #[code N] (although it may visit more, if it back-tracks with a
|
||||||
|
| non-monotonic transition#[+fn(6)]). A naive implementation would require
|
||||||
|
| #[code 2*N (B, 13*T) @ (13*T, H)] matrix multiplications for a batch of
|
||||||
|
| size #[code B]. We can instead perform one #[code (B*N, T) @ (T, 13*H)]
|
||||||
|
| multiplication, to pre-compute the hidden weights for each positional
|
||||||
|
| feature with respect to the words in the batch. (Note that our token
|
||||||
|
| vectors come from the CNN — so we can't play this trick over the
|
||||||
|
| vocabulary. That's how Stanford's NN parser#[+fn(7)] works — and why its
|
||||||
|
| model is so big.)
|
||||||
|
|
||||||
|
p
|
||||||
|
| This pre-computation strategy allows a nice compromise between
|
||||||
|
| GPU-friendliness and implementation simplicity. The CNN and the wide
|
||||||
|
| lower layer are computed on the GPU, and then the precomputed hidden
|
||||||
|
| weights are moved to the CPU, before we start the transition-based
|
||||||
|
| parsing process. This makes a lot of things much easier. We don't have to
|
||||||
|
| worry about variable-length batch sizes, and we don't have to implement
|
||||||
|
| the dynamic oracle in CUDA to train.
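p
    | As a rough illustration of the pre-computation trick described above, here
    | is a small numpy sketch. The sizes and array names are made up for the
    | example and are not spaCy's internals:

+code.
    import numpy as np

    T, H, F = 128, 64, 13                    # token width, hidden width, features per state
    n_tokens = 50                            # tokens in the batch (B*N)
    tokens = np.random.randn(n_tokens, T)    # context-sensitive vectors from the CNN
    W = np.random.randn(F, T, H)             # one (T, H) weight block per positional feature
    b = np.random.randn(H)

    # one big multiplication up front: every token's contribution to every feature slot
    pre = np.einsum('nt,fth->nfh', tokens, W)         # (n_tokens, F, H)

    # at parse time, a state only gathers and sums 13 pre-computed rows
    state = np.random.randint(0, n_tokens, size=F)    # indices of S0, S1, ..., B1R2
    hidden_fast = pre[state, np.arange(F)].sum(axis=0) + b

    # equivalent naive version: concatenate 13 token vectors per state and multiply
    hidden_naive = tokens[state].reshape(F * T) @ W.reshape(F * T, H) + b
    assert np.allclose(hidden_fast, hidden_naive)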
|
||||||
|
|
||||||
|
p
|
||||||
|
| Currently the parser's loss function is multilabel log loss#[+fn(6)], as
|
||||||
|
| the dynamic oracle allows multiple states to be 0 cost. This is defined
|
||||||
|
| as follows, where #[code gZ] is the sum of the exponentiated scores assigned to gold
|
||||||
|
| classes:
|
||||||
|
|
||||||
|
+code.
|
||||||
|
(exp(score) / Z) - (exp(score) / gZ)
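p
    | A small numpy sketch of the corresponding gradient, assuming #[code Z] and
    | #[code gZ] are the exponentiated-score sums over all classes and over the
    | gold classes respectively. This is illustrative only, not the training code:

+code.
    import numpy as np

    def multilabel_logloss_grad(scores, is_gold):
        exp = np.exp(scores - scores.max())    # shift scores for numerical stability
        Z = exp.sum()                          # normaliser over all classes
        gZ = exp[is_gold].sum()                # normaliser over the gold (zero-cost) classes
        grad = exp / Z
        grad[is_gold] -= exp[is_gold] / gZ
        return grad

    scores = np.array([2.0, 0.5, -1.0, 1.5])
    is_gold = np.array([True, False, False, True])
    print(multilabel_logloss_grad(scores, is_gold))    # entries sum to ~0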
|
||||||
|
|
||||||
|
+bibliography
|
||||||
|
+item
|
||||||
|
| #[+a("https://www.semanticscholar.org/paper/Simple-and-Accurate-Dependency-Parsing-Using-Bidir-Kiperwasser-Goldberg/3cf31ecb2724b5088783d7c96a5fc0d5604cbf41") Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations]
|
||||||
|
br
|
||||||
|
| Eliyahu Kiperwasser, Yoav Goldberg. (2016)
|
||||||
|
|
||||||
|
+item
|
||||||
|
| #[+a("https://www.semanticscholar.org/paper/A-Dynamic-Oracle-for-Arc-Eager-Dependency-Parsing-Goldberg-Nivre/22697256ec19ecc3e14fcfc63624a44cf9c22df4") A Dynamic Oracle for Arc-Eager Dependency Parsing]
|
||||||
|
br
|
||||||
|
| Yoav Goldberg, Joakim Nivre (2012)
|
||||||
|
|
||||||
|
+item
|
||||||
|
| #[+a("https://explosion.ai/blog/parsing-english-in-python") Parsing English in 500 Lines of Python]
|
||||||
|
br
|
||||||
|
| Matthew Honnibal (2013)
|
||||||
|
|
||||||
|
+item
|
||||||
|
| #[+a("https://www.semanticscholar.org/paper/Stack-propagation-Improved-Representation-Learning-Zhang-Weiss/0c133f79b23e8c680891d2e49a66f0e3d37f1466") Stack-propagation: Improved Representation Learning for Syntax]
|
||||||
|
br
|
||||||
|
| Yuan Zhang, David Weiss (2016)
|
||||||
|
|
||||||
|
+item
|
||||||
|
| #[+a("https://www.semanticscholar.org/paper/Deep-multi-task-learning-with-low-level-tasks-supe-S%C3%B8gaard-Goldberg/03ad06583c9721855ccd82c3d969a01360218d86") Deep multi-task learning with low level tasks supervised at lower layers]
|
||||||
|
br
|
||||||
|
| Anders Søgaard, Yoav Goldberg (2016)
|
||||||
|
|
||||||
|
+item
|
||||||
|
| #[+a("https://www.semanticscholar.org/paper/An-Improved-Non-monotonic-Transition-System-for-De-Honnibal-Johnson/4094cee47ade13b77b5ab4d2e6cb9dd2b8a2917c") An Improved Non-monotonic Transition System for Dependency Parsing]
|
||||||
|
br
|
||||||
|
| Matthew Honnibal, Mark Johnson (2015)
|
||||||
|
|
||||||
|
+item
|
||||||
|
| #[+a("http://cs.stanford.edu/people/danqi/papers/emnlp2014.pdf") A Fast and Accurate Dependency Parser using Neural Networks]
|
||||||
|
br
|
||||||
|
| Danqi Chen, Christopher D. Manning (2014)
|
||||||
|
|
||||||
|
+item
|
||||||
|
| #[+a("https://www.semanticscholar.org/paper/Parsing-the-Wall-Street-Journal-using-a-Lexical-Fu-Riezler-King/0ad07862a91cd59b7eb5de38267e47725a62b8b2") Parsing the Wall Street Journal using a Lexical-Functional Grammar and Discriminative Estimation Techniques]
|
||||||
|
br
|
||||||
|
| Stefan Riezler et al. (2002)
|
|
@ -1,29 +1,32 @@
|
||||||
{
|
{
|
||||||
"sidebar": {
|
"sidebar": {
|
||||||
"Introduction": {
|
"Overview": {
|
||||||
"Facts & Figures": "./",
|
"Architecture": "./",
|
||||||
"Languages": "language-models",
|
"Annotation Specs": "annotation",
|
||||||
"Annotation Specs": "annotation"
|
"Functions": "top-level"
|
||||||
},
|
},
|
||||||
"Top-level": {
|
"Containers": {
|
||||||
"spacy": "spacy",
|
|
||||||
"displacy": "displacy",
|
|
||||||
"Utility Functions": "util",
|
|
||||||
"Command line": "cli"
|
|
||||||
},
|
|
||||||
"Classes": {
|
|
||||||
"Doc": "doc",
|
"Doc": "doc",
|
||||||
"Token": "token",
|
"Token": "token",
|
||||||
"Span": "span",
|
"Span": "span",
|
||||||
|
"Lexeme": "lexeme"
|
||||||
|
},
|
||||||
|
|
||||||
|
"Pipeline": {
|
||||||
"Language": "language",
|
"Language": "language",
|
||||||
"Tokenizer": "tokenizer",
|
"Pipe": "pipe",
|
||||||
"Tensorizer": "tensorizer",
|
"Tensorizer": "tensorizer",
|
||||||
"Tagger": "tagger",
|
"Tagger": "tagger",
|
||||||
"DependencyParser": "dependencyparser",
|
"DependencyParser": "dependencyparser",
|
||||||
"EntityRecognizer": "entityrecognizer",
|
"EntityRecognizer": "entityrecognizer",
|
||||||
"TextCategorizer": "textcategorizer",
|
"TextCategorizer": "textcategorizer",
|
||||||
|
"Tokenizer": "tokenizer",
|
||||||
|
"Lemmatizer": "lemmatizer",
|
||||||
"Matcher": "matcher",
|
"Matcher": "matcher",
|
||||||
"Lexeme": "lexeme",
|
"PhraseMatcher": "phrasematcher"
|
||||||
|
},
|
||||||
|
|
||||||
|
"Other": {
|
||||||
"Vocab": "vocab",
|
"Vocab": "vocab",
|
||||||
"StringStore": "stringstore",
|
"StringStore": "stringstore",
|
||||||
"Vectors": "vectors",
|
"Vectors": "vectors",
|
||||||
|
@ -34,52 +37,37 @@
|
||||||
},
|
},
|
||||||
|
|
||||||
"index": {
|
"index": {
|
||||||
"title": "Facts & Figures",
|
"title": "Architecture",
|
||||||
"next": "language-models"
|
"next": "annotation",
|
||||||
|
"menu": {
|
||||||
|
"Basics": "basics",
|
||||||
|
"Neural Network Model": "nn-model",
|
||||||
|
"Cython Conventions": "cython"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
"language-models": {
|
"top-level": {
|
||||||
"title": "Languages",
|
"title": "Top-level Functions",
|
||||||
"next": "philosophy"
|
"menu": {
|
||||||
},
|
"spacy": "spacy",
|
||||||
|
"displacy": "displacy",
|
||||||
"philosophy": {
|
"Utility Functions": "util",
|
||||||
"title": "Philosophy"
|
"Compatibility": "compat",
|
||||||
},
|
"Command Line": "cli"
|
||||||
|
}
|
||||||
"spacy": {
|
|
||||||
"title": "spaCy top-level functions",
|
|
||||||
"source": "spacy/__init__.py",
|
|
||||||
"next": "displacy"
|
|
||||||
},
|
|
||||||
|
|
||||||
"displacy": {
|
|
||||||
"title": "displaCy",
|
|
||||||
"tag": "module",
|
|
||||||
"source": "spacy/displacy",
|
|
||||||
"next": "util"
|
|
||||||
},
|
|
||||||
|
|
||||||
"util": {
|
|
||||||
"title": "Utility Functions",
|
|
||||||
"source": "spacy/util.py",
|
|
||||||
"next": "cli"
|
|
||||||
},
|
|
||||||
|
|
||||||
"cli": {
|
|
||||||
"title": "Command Line Interface",
|
|
||||||
"source": "spacy/cli"
|
|
||||||
},
|
},
|
||||||
|
|
||||||
"language": {
|
"language": {
|
||||||
"title": "Language",
|
"title": "Language",
|
||||||
"tag": "class",
|
"tag": "class",
|
||||||
|
"teaser": "A text-processing pipeline.",
|
||||||
"source": "spacy/language.py"
|
"source": "spacy/language.py"
|
||||||
},
|
},
|
||||||
|
|
||||||
"doc": {
|
"doc": {
|
||||||
"title": "Doc",
|
"title": "Doc",
|
||||||
"tag": "class",
|
"tag": "class",
|
||||||
|
"teaser": "A container for accessing linguistic annotations.",
|
||||||
"source": "spacy/tokens/doc.pyx"
|
"source": "spacy/tokens/doc.pyx"
|
||||||
},
|
},
|
||||||
|
|
||||||
|
@ -103,6 +91,7 @@
|
||||||
|
|
||||||
"vocab": {
|
"vocab": {
|
||||||
"title": "Vocab",
|
"title": "Vocab",
|
||||||
|
"teaser": "A storage class for vocabulary and other data shared across a language.",
|
||||||
"tag": "class",
|
"tag": "class",
|
||||||
"source": "spacy/vocab.pyx"
|
"source": "spacy/vocab.pyx"
|
||||||
},
|
},
|
||||||
|
@ -115,10 +104,27 @@
|
||||||
|
|
||||||
"matcher": {
|
"matcher": {
|
||||||
"title": "Matcher",
|
"title": "Matcher",
|
||||||
|
"teaser": "Match sequences of tokens, based on pattern rules.",
|
||||||
"tag": "class",
|
"tag": "class",
|
||||||
"source": "spacy/matcher.pyx"
|
"source": "spacy/matcher.pyx"
|
||||||
},
|
},
|
||||||
|
|
||||||
|
"phrasematcher": {
|
||||||
|
"title": "PhraseMatcher",
|
||||||
|
"teaser": "Match sequences of tokens, based on documents.",
|
||||||
|
"tag": "class",
|
||||||
|
"tag_new": 2,
|
||||||
|
"source": "spacy/matcher.pyx"
|
||||||
|
},
|
||||||
|
|
||||||
|
"pipe": {
|
||||||
|
"title": "Pipe",
|
||||||
|
"teaser": "Abstract base class defining the API for pipeline components.",
|
||||||
|
"tag": "class",
|
||||||
|
"tag_new": 2,
|
||||||
|
"source": "spacy/pipeline.pyx"
|
||||||
|
},
|
||||||
|
|
||||||
"dependenyparser": {
|
"dependenyparser": {
|
||||||
"title": "DependencyParser",
|
"title": "DependencyParser",
|
||||||
"tag": "class",
|
"tag": "class",
|
||||||
|
@ -127,18 +133,22 @@
|
||||||
|
|
||||||
"entityrecognizer": {
|
"entityrecognizer": {
|
||||||
"title": "EntityRecognizer",
|
"title": "EntityRecognizer",
|
||||||
|
"teaser": "Annotate named entities on documents.",
|
||||||
"tag": "class",
|
"tag": "class",
|
||||||
"source": "spacy/pipeline.pyx"
|
"source": "spacy/pipeline.pyx"
|
||||||
},
|
},
|
||||||
|
|
||||||
"textcategorizer": {
|
"textcategorizer": {
|
||||||
"title": "TextCategorizer",
|
"title": "TextCategorizer",
|
||||||
|
"teaser": "Add text categorization models to spaCy pipelines.",
|
||||||
"tag": "class",
|
"tag": "class",
|
||||||
|
"tag_new": 2,
|
||||||
"source": "spacy/pipeline.pyx"
|
"source": "spacy/pipeline.pyx"
|
||||||
},
|
},
|
||||||
|
|
||||||
"dependencyparser": {
|
"dependencyparser": {
|
||||||
"title": "DependencyParser",
|
"title": "DependencyParser",
|
||||||
|
"teaser": "Annotate syntactic dependencies on documents.",
|
||||||
"tag": "class",
|
"tag": "class",
|
||||||
"source": "spacy/pipeline.pyx"
|
"source": "spacy/pipeline.pyx"
|
||||||
},
|
},
|
||||||
|
@ -149,15 +159,23 @@
|
||||||
"source": "spacy/tokenizer.pyx"
|
"source": "spacy/tokenizer.pyx"
|
||||||
},
|
},
|
||||||
|
|
||||||
|
"lemmatizer": {
|
||||||
|
"title": "Lemmatizer",
|
||||||
|
"tag": "class"
|
||||||
|
},
|
||||||
|
|
||||||
"tagger": {
|
"tagger": {
|
||||||
"title": "Tagger",
|
"title": "Tagger",
|
||||||
|
"teaser": "Annotate part-of-speech tags on documents.",
|
||||||
"tag": "class",
|
"tag": "class",
|
||||||
"source": "spacy/pipeline.pyx"
|
"source": "spacy/pipeline.pyx"
|
||||||
},
|
},
|
||||||
|
|
||||||
"tensorizer": {
|
"tensorizer": {
|
||||||
"title": "Tensorizer",
|
"title": "Tensorizer",
|
||||||
|
"teaser": "Add a tensor with position-sensitive meaning representations to a document.",
|
||||||
"tag": "class",
|
"tag": "class",
|
||||||
|
"tag_new": 2,
|
||||||
"source": "spacy/pipeline.pyx"
|
"source": "spacy/pipeline.pyx"
|
||||||
},
|
},
|
||||||
|
|
||||||
|
@ -169,23 +187,38 @@
|
||||||
|
|
||||||
"goldcorpus": {
|
"goldcorpus": {
|
||||||
"title": "GoldCorpus",
|
"title": "GoldCorpus",
|
||||||
|
"teaser": "An annotated corpus, using the JSON file format.",
|
||||||
"tag": "class",
|
"tag": "class",
|
||||||
|
"tag_new": 2,
|
||||||
"source": "spacy/gold.pyx"
|
"source": "spacy/gold.pyx"
|
||||||
},
|
},
|
||||||
|
|
||||||
"binder": {
|
"binder": {
|
||||||
"title": "Binder",
|
"title": "Binder",
|
||||||
"tag": "class",
|
"tag": "class",
|
||||||
|
"tag_new": 2,
|
||||||
"source": "spacy/tokens/binder.pyx"
|
"source": "spacy/tokens/binder.pyx"
|
||||||
},
|
},
|
||||||
|
|
||||||
"vectors": {
|
"vectors": {
|
||||||
"title": "Vectors",
|
"title": "Vectors",
|
||||||
|
"teaser": "Store, save and load word vectors.",
|
||||||
"tag": "class",
|
"tag": "class",
|
||||||
|
"tag_new": 2,
|
||||||
"source": "spacy/vectors.pyx"
|
"source": "spacy/vectors.pyx"
|
||||||
},
|
},
|
||||||
|
|
||||||
"annotation": {
|
"annotation": {
|
||||||
"title": "Annotation Specifications"
|
"title": "Annotation Specifications",
|
||||||
|
"teaser": "Schemes used for labels, tags and training data.",
|
||||||
|
"menu": {
|
||||||
|
"Tokenization": "tokenization",
|
||||||
|
"Sentence Boundaries": "sbd",
|
||||||
|
"POS Tagging": "pos-tagging",
|
||||||
|
"Lemmatization": "lemmatization",
|
||||||
|
"Dependencies": "dependency-parsing",
|
||||||
|
"Named Entities": "named-entities",
|
||||||
|
"Training Data": "training"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -1,26 +1,17 @@
|
||||||
//- 💫 DOCS > USAGE > COMMAND LINE INTERFACE
|
//- 💫 DOCS > API > TOP-LEVEL > COMMAND LINE INTERFACE
|
||||||
|
|
||||||
include ../../_includes/_mixins
|
|
||||||
|
|
||||||
p
|
p
|
||||||
| As of v1.7.0, spaCy comes with new command line helpers to download and
|
| As of v1.7.0, spaCy comes with new command line helpers to download and
|
||||||
| link models and show useful debugging information. For a list of available
|
| link models and show useful debugging information. For a list of available
|
||||||
| commands, type #[code spacy --help].
|
| commands, type #[code spacy --help].
|
||||||
|
|
||||||
+infobox("⚠️ Deprecation note")
|
+h(3, "download") Download
|
||||||
| As of spaCy 2.0, the #[code model] command to initialise a model data
|
|
||||||
| directory is deprecated. The command was only necessary because previous
|
|
||||||
| versions of spaCy expected a model directory to already be set up. This
|
|
||||||
| has since been changed, so you can use the #[+api("cli#train") #[code train]]
|
|
||||||
| command straight away.
|
|
||||||
|
|
||||||
+h(2, "download") Download
|
|
||||||
|
|
||||||
p
|
p
|
||||||
| Download #[+a("/docs/usage/models") models] for spaCy. The downloader finds the
|
| Download #[+a("/usage/models") models] for spaCy. The downloader finds the
|
||||||
| best-matching compatible version, uses pip to download the model as a
|
| best-matching compatible version, uses pip to download the model as a
|
||||||
| package and automatically creates a
|
| package and automatically creates a
|
||||||
| #[+a("/docs/usage/models#usage") shortcut link] to load the model by name.
|
| #[+a("/usage/models#usage") shortcut link] to load the model by name.
|
||||||
| Direct downloads don't perform any compatibility checks and require the
|
| Direct downloads don't perform any compatibility checks and require the
|
||||||
| model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]).
|
| model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]).
|
||||||
|
|
||||||
|
@ -49,15 +40,15 @@ p
|
||||||
| detailed messages in case things go wrong. It's #[strong not recommended]
|
| detailed messages in case things go wrong. It's #[strong not recommended]
|
||||||
| to use this command as part of an automated process. If you know which
|
| to use this command as part of an automated process. If you know which
|
||||||
| model your project needs, you should consider a
|
| model your project needs, you should consider a
|
||||||
| #[+a("/docs/usage/models#download-pip") direct download via pip], or
|
| #[+a("/usage/models#download-pip") direct download via pip], or
|
||||||
| uploading the model to a local PyPi installation and fetching it straight
|
| uploading the model to a local PyPi installation and fetching it straight
|
||||||
| from there. This will also allow you to add it as a versioned package
|
| from there. This will also allow you to add it as a versioned package
|
||||||
| dependency to your project.
|
| dependency to your project.
|
||||||
|
|
||||||
+h(2, "link") Link
|
+h(3, "link") Link
|
||||||
|
|
||||||
p
|
p
|
||||||
| Create a #[+a("/docs/usage/models#usage") shortcut link] for a model,
|
| Create a #[+a("/usage/models#usage") shortcut link] for a model,
|
||||||
| either a Python package or a local directory. This will let you load
|
| either a Python package or a local directory. This will let you load
|
||||||
| models from any location using a custom name via
|
| models from any location using a custom name via
|
||||||
| #[+api("spacy#load") #[code spacy.load()]].
|
| #[+api("spacy#load") #[code spacy.load()]].
|
||||||
|
@ -95,7 +86,7 @@ p
|
||||||
+cell flag
|
+cell flag
|
||||||
+cell Show help message and available arguments.
|
+cell Show help message and available arguments.
|
||||||
|
|
||||||
+h(2, "info") Info
|
+h(3, "info") Info
|
||||||
|
|
||||||
p
|
p
|
||||||
| Print information about your spaCy installation, models and local setup,
|
| Print information about your spaCy installation, models and local setup,
|
||||||
|
@ -122,15 +113,15 @@ p
|
||||||
+cell flag
|
+cell flag
|
||||||
+cell Show help message and available arguments.
|
+cell Show help message and available arguments.
|
||||||
|
|
||||||
+h(2, "convert") Convert
|
+h(3, "convert") Convert
|
||||||
|
|
||||||
p
|
p
|
||||||
| Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format]
|
| Convert files into spaCy's #[+a("/api/annotation#json-input") JSON format]
|
||||||
| for use with the #[code train] command and other experiment management
|
| for use with the #[code train] command and other experiment management
|
||||||
| functions. The right converter is chosen based on the file extension of
|
| functions. The right converter is chosen based on the file extension of
|
||||||
| the input file. Currently only supports #[code .conllu].
|
| the input file. Currently only supports #[code .conllu].
|
||||||
|
|
||||||
+code(false, "bash", "$").
|
+code(false, "bash", "$", false, false, true).
|
||||||
spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
|
spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
|
@ -159,14 +150,18 @@ p
|
||||||
+cell flag
|
+cell flag
|
||||||
+cell Show help message and available arguments.
|
+cell Show help message and available arguments.
|
||||||
|
|
||||||
+h(2, "train") Train
|
+h(3, "train") Train
|
||||||
|
|
||||||
p
|
p
|
||||||
| Train a model. Expects data in spaCy's
|
| Train a model. Expects data in spaCy's
|
||||||
| #[+a("/docs/api/annotation#json-input") JSON format].
|
| #[+a("/api/annotation#json-input") JSON format]. On each epoch, a model
|
||||||
|
| will be saved out to the directory. Accuracy scores and model details
|
||||||
|
| will be added to a #[+a("/usage/training#models-generating") #[code meta.json]]
|
||||||
|
| to allow packaging the model using the
|
||||||
|
| #[+api("cli#package") #[code package]] command.
|
||||||
|
|
||||||
+code(false, "bash", "$").
|
+code(false, "bash", "$", false, false, true).
|
||||||
spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
|
spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser] [--no-entities] [--gold-preproc]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -204,6 +199,27 @@ p
|
||||||
+cell option
|
+cell option
|
||||||
+cell Use GPU.
|
+cell Use GPU.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --vectors], #[code -v]
|
||||||
|
+cell option
|
||||||
|
+cell Model to load vectors from.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --meta-path], #[code -m]
|
||||||
|
+cell option
|
||||||
|
+cell
|
||||||
|
| #[+tag-new(2)] Optional path to model
|
||||||
|
| #[+a("/usage/training#models-generating") #[code meta.json]].
|
||||||
|
| All relevant properties like #[code lang], #[code pipeline] and
|
||||||
|
| #[code spacy_version] will be overwritten.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --version], #[code -V]
|
||||||
|
+cell option
|
||||||
|
+cell
|
||||||
|
| Model version. Will be written out to the model's
|
||||||
|
| #[code meta.json] after training.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --no-tagger], #[code -T]
|
+cell #[code --no-tagger], #[code -T]
|
||||||
+cell flag
|
+cell flag
|
||||||
|
@ -219,12 +235,18 @@ p
|
||||||
+cell flag
|
+cell flag
|
||||||
+cell Don't train NER.
|
+cell Don't train NER.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --gold-preproc], #[code -G]
|
||||||
|
+cell flag
|
||||||
|
+cell Use gold preprocessing.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --help], #[code -h]
|
+cell #[code --help], #[code -h]
|
||||||
+cell flag
|
+cell flag
|
||||||
+cell Show help message and available arguments.
|
+cell Show help message and available arguments.
|
||||||
|
|
||||||
+h(3, "train-hyperparams") Environment variables for hyperparameters
|
+h(4, "train-hyperparams") Environment variables for hyperparameters
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
p
|
p
|
||||||
| spaCy lets you set hyperparameters for training via environment variables.
|
| spaCy lets you set hyperparameters for training via environment variables.
|
||||||
|
@ -236,98 +258,149 @@ p
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
parser_hidden_depth=2 parser_maxout_pieces=1 train-parser
|
parser_hidden_depth=2 parser_maxout_pieces=1 train-parser
|
||||||
|
|
||||||
+under-construction
|
|
||||||
|
|
||||||
+table(["Name", "Description", "Default"])
|
+table(["Name", "Description", "Default"])
|
||||||
+row
|
+row
|
||||||
+cell #[code dropout_from]
|
+cell #[code dropout_from]
|
||||||
+cell
|
+cell Initial dropout rate.
|
||||||
+cell #[code 0.2]
|
+cell #[code 0.2]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code dropout_to]
|
+cell #[code dropout_to]
|
||||||
+cell
|
+cell Final dropout rate.
|
||||||
+cell #[code 0.2]
|
+cell #[code 0.2]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code dropout_decay]
|
+cell #[code dropout_decay]
|
||||||
+cell
|
+cell Rate of dropout change.
|
||||||
+cell #[code 0.0]
|
+cell #[code 0.0]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code batch_from]
|
+cell #[code batch_from]
|
||||||
+cell
|
+cell Initial batch size.
|
||||||
+cell #[code 1]
|
+cell #[code 1]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code batch_to]
|
+cell #[code batch_to]
|
||||||
+cell
|
+cell Final batch size.
|
||||||
+cell #[code 64]
|
+cell #[code 64]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code batch_compound]
|
+cell #[code batch_compound]
|
||||||
+cell
|
+cell Rate of batch size acceleration (see the sketch below this table).
|
||||||
+cell #[code 1.001]
|
+cell #[code 1.001]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code token_vector_width]
|
+cell #[code token_vector_width]
|
||||||
+cell
|
+cell Width of embedding tables and convolutional layers.
|
||||||
+cell #[code 128]
|
+cell #[code 128]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code embed_size]
|
+cell #[code embed_size]
|
||||||
+cell
|
+cell Number of rows in embedding tables.
|
||||||
+cell #[code 7500]
|
+cell #[code 7500]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code parser_maxout_pieces]
|
+cell #[code parser_maxout_pieces]
|
||||||
+cell
|
+cell Number of pieces in the parser's and NER's first maxout layer.
|
||||||
+cell #[code 2]
|
+cell #[code 2]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code parser_hidden_depth]
|
+cell #[code parser_hidden_depth]
|
||||||
+cell
|
+cell Number of hidden layers in the parser and NER.
|
||||||
+cell #[code 1]
|
+cell #[code 1]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code hidden_width]
|
+cell #[code hidden_width]
|
||||||
+cell
|
+cell Size of the parser's and NER's hidden layers.
|
||||||
+cell #[code 128]
|
+cell #[code 128]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code learn_rate]
|
+cell #[code learn_rate]
|
||||||
+cell
|
+cell Learning rate.
|
||||||
+cell #[code 0.001]
|
+cell #[code 0.001]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code optimizer_B1]
|
+cell #[code optimizer_B1]
|
||||||
+cell
|
+cell Momentum for the Adam solver.
|
||||||
+cell #[code 0.9]
|
+cell #[code 0.9]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code optimizer_B2]
|
+cell #[code optimizer_B2]
|
||||||
+cell
|
+cell Adagrad-momentum for the Adam solver.
|
||||||
+cell #[code 0.999]
|
+cell #[code 0.999]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code optimizer_eps]
|
+cell #[code optimizer_eps]
|
||||||
+cell
|
+cell Epsilon value for the Adam solver.
|
||||||
+cell #[code 1e-08]
|
+cell #[code 1e-08]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code L2_penalty]
|
+cell #[code L2_penalty]
|
||||||
+cell
|
+cell L2 regularisation penalty.
|
||||||
+cell #[code 1e-06]
|
+cell #[code 1e-06]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code grad_norm_clip]
|
+cell #[code grad_norm_clip]
|
||||||
+cell
|
+cell Gradient L2 norm constraint.
|
||||||
+cell #[code 1.0]
|
+cell #[code 1.0]
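p
    | For example, the batch size implied by the defaults above grows
    | multiplicatively from #[code batch_from] towards #[code batch_to]. The
    | following is a rough sketch of that behaviour, not the CLI's actual
    | implementation:

+code.
    def compounding(start, stop, compound):
        # yields start, start * compound, start * compound**2, ... capped at stop
        value = float(start)
        while True:
            yield min(value, stop)
            value *= compound

    sizes = compounding(1, 64, 1.001)
    batches = [int(next(sizes)) for _ in range(5)]    # grows slowly towards 64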
|
||||||
|
|
||||||
+h(2, "package") Package
|
+h(3, "evaluate") Evaluate
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
p
|
p
|
||||||
| Generate a #[+a("/docs/usage/saving-loading#generating") model Python package]
|
| Evaluate a model's accuracy and speed on JSON-formatted annotated data.
|
||||||
|
| Will print the results and optionally export
|
||||||
|
| #[+a("/usage/visualizers") displaCy visualizations] of a sample set of
|
||||||
|
| parses to #[code .html] files. Visualizations for the dependency parse
|
||||||
|
| and NER will be exported as separate files if the respective component
|
||||||
|
| is present in the model's pipeline.
|
||||||
|
|
||||||
|
+code(false, "bash", "$", false, false, true).
|
||||||
|
spacy evaluate [model] [data_path] [--displacy-path] [--displacy-limit] [--gpu-id] [--gold-preproc]
|
||||||
|
|
||||||
|
+table(["Argument", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code model]
|
||||||
|
+cell positional
|
||||||
|
+cell
|
||||||
|
| Model to evaluate. Can be a package or shortcut link name, or a
|
||||||
|
| path to a model data directory.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code data_path]
|
||||||
|
+cell positional
|
||||||
|
+cell Location of JSON-formatted evaluation data.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --displacy-path], #[code -dp]
|
||||||
|
+cell option
|
||||||
|
+cell
|
||||||
|
| Directory to output rendered parses as HTML. If not set, no
|
||||||
|
| visualizations will be generated.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --displacy-limit], #[code -dl]
|
||||||
|
+cell option
|
||||||
|
+cell
|
||||||
|
| Number of parses to generate per file. Defaults to #[code 25].
|
||||||
|
| Keep in mind that a significantly higher number might cause the
|
||||||
|
| #[code .html] files to render slowly.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --gpu-id], #[code -g]
|
||||||
|
+cell option
|
||||||
|
+cell GPU to use, if any. Defaults to #[code -1] for CPU.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --gold-preproc], #[code -G]
|
||||||
|
+cell flag
|
||||||
|
+cell Use gold preprocessing.
|
||||||
|
|
||||||
|
|
||||||
|
+h(3, "package") Package
|
||||||
|
|
||||||
|
p
|
||||||
|
| Generate a #[+a("/usage/training#models-generating") model Python package]
|
||||||
| from an existing model data directory. All data files are copied over.
|
| from an existing model data directory. All data files are copied over.
|
||||||
| If the path to a meta.json is supplied, or a meta.json is found in the
|
| If the path to a meta.json is supplied, or a meta.json is found in the
|
||||||
| input directory, this file is used. Otherwise, the data can be entered
|
| input directory, this file is used. Otherwise, the data can be entered
|
||||||
|
@ -336,8 +409,8 @@ p
|
||||||
| sure you're always using the latest versions. This means you need to be
|
| sure you're always using the latest versions. This means you need to be
|
||||||
| connected to the internet to use this command.
|
| connected to the internet to use this command.
|
||||||
|
|
||||||
+code(false, "bash", "$").
|
+code(false, "bash", "$", false, false, true).
|
||||||
spacy package [input_dir] [output_dir] [--meta] [--force]
|
spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -353,14 +426,14 @@ p
|
||||||
+row
|
+row
|
||||||
+cell #[code --meta-path], #[code -m]
|
+cell #[code --meta-path], #[code -m]
|
||||||
+cell option
|
+cell option
|
||||||
+cell Path to meta.json file (optional).
|
+cell #[+tag-new(2)] Path to meta.json file (optional).
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --create-meta], #[code -c]
|
+cell #[code --create-meta], #[code -c]
|
||||||
+cell flag
|
+cell flag
|
||||||
+cell
|
+cell
|
||||||
| Create a meta.json file on the command line, even if one already
|
| #[+tag-new(2)] Create a meta.json file on the command line, even
|
||||||
| exists in the directory.
|
| if one already exists in the directory.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --force], #[code -f]
|
+cell #[code --force], #[code -f]
|
91
website/api/_top-level/_compat.jade
Normal file
|
@ -0,0 +1,91 @@
|
||||||
|
//- 💫 DOCS > API > TOP-LEVEL > COMPATIBILITY
|
||||||
|
|
||||||
|
p
|
||||||
|
| All Python code is written in an
|
||||||
|
| #[strong intersection of Python 2 and Python 3]. This is easy in Cython,
|
||||||
|
| but somewhat ugly in Python. Logic that deals with Python or platform
|
||||||
|
| compatibility only lives in #[code spacy.compat]. To distinguish them from
|
||||||
|
| the builtin functions, replacement functions are suffixed with an
|
||||||
|
| underscore, e.g. #[code unicode_]. For specific checks, spaCy uses the
|
||||||
|
| #[code six] and #[code ftfy] packages.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.compat import unicode_, json_dumps
|
||||||
|
|
||||||
|
compatible_unicode = unicode_('hello world')
|
||||||
|
compatible_json = json_dumps({'key': 'value'})
|
||||||
|
|
||||||
|
+table(["Name", "Python 2", "Python 3"])
|
||||||
|
+row
|
||||||
|
+cell #[code compat.bytes_]
|
||||||
|
+cell #[code str]
|
||||||
|
+cell #[code bytes]
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code compat.unicode_]
|
||||||
|
+cell #[code unicode]
|
||||||
|
+cell #[code str]
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code compat.basestring_]
|
||||||
|
+cell #[code basestring]
|
||||||
|
+cell #[code str]
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code compat.input_]
|
||||||
|
+cell #[code raw_input]
|
||||||
|
+cell #[code input]
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code compat.json_dumps]
|
||||||
|
+cell #[code ujson.dumps] with #[code .decode('utf8')]
|
||||||
|
+cell #[code ujson.dumps]
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code compat.path2str]
|
||||||
|
+cell #[code str(path)] with #[code .decode('utf8')]
|
||||||
|
+cell #[code str(path)]
|
||||||
|
|
||||||
|
+h(3, "is_config") compat.is_config
|
||||||
|
+tag function
|
||||||
|
|
||||||
|
p
|
||||||
|
| Check if a specific configuration of Python version and operating system
|
||||||
|
| matches the user's setup. Mostly used to display targeted error messages.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.compat import is_config
|
||||||
|
|
||||||
|
if is_config(python2=True, windows=True):
|
||||||
|
print("You are using Python 2 on Windows.")
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code python2]
|
||||||
|
+cell bool
|
||||||
|
+cell spaCy is executed with Python 2.x.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code python3]
|
||||||
|
+cell bool
|
||||||
|
+cell spaCy is executed with Python 3.x.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code windows]
|
||||||
|
+cell bool
|
||||||
|
+cell spaCy is executed on Windows.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code linux]
|
||||||
|
+cell bool
|
||||||
|
+cell spaCy is executed on Linux.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code osx]
|
||||||
|
+cell bool
|
||||||
|
+cell spaCy is executed on OS X or macOS.
|
||||||
|
|
||||||
|
+row("foot")
|
||||||
|
+cell returns
|
||||||
|
+cell bool
|
||||||
|
+cell Whether the specified configuration matches the user's platform.
|
|
@ -1,14 +1,12 @@
|
||||||
//- 💫 DOCS > API > DISPLACY
|
//- 💫 DOCS > API > TOP-LEVEL > DISPLACY
|
||||||
|
|
||||||
include ../../_includes/_mixins
|
|
||||||
|
|
||||||
p
|
p
|
||||||
| As of v2.0, spaCy comes with a built-in visualization suite. For more
|
| As of v2.0, spaCy comes with a built-in visualization suite. For more
|
||||||
| info and examples, see the usage guide on
|
| info and examples, see the usage guide on
|
||||||
| #[+a("/docs/usage/visualizers") visualizing spaCy].
|
| #[+a("/usage/visualizers") visualizing spaCy].
|
||||||
|
|
||||||
|
|
||||||
+h(2, "serve") displacy.serve
|
+h(3, "displacy.serve") displacy.serve
|
||||||
+tag method
|
+tag method
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
||||||
|
@ -60,7 +58,7 @@ p
|
||||||
+cell bool
|
+cell bool
|
||||||
+cell
|
+cell
|
||||||
| Don't parse #[code Doc] and instead, expect a dict or list of
|
| Don't parse #[code Doc] and instead, expect a dict or list of
|
||||||
| dicts. #[+a("/docs/usage/visualizers#manual-usage") See here]
|
| dicts. #[+a("/usage/visualizers#manual-usage") See here]
|
||||||
| for formats and examples.
|
| for formats and examples.
|
||||||
+cell #[code False]
|
+cell #[code False]
|
||||||
|
|
||||||
|
@ -70,7 +68,7 @@ p
|
||||||
+cell Port to serve visualization.
|
+cell Port to serve visualization.
|
||||||
+cell #[code 5000]
|
+cell #[code 5000]
|
||||||
|
|
||||||
+h(2, "render") displacy.render
|
+h(3, "displacy.render") displacy.render
|
||||||
+tag method
|
+tag method
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
||||||
|
@ -127,24 +125,24 @@ p Render a dependency parse tree or named entity visualization.
|
||||||
+cell bool
|
+cell bool
|
||||||
+cell
|
+cell
|
||||||
| Don't parse #[code Doc] and instead, expect a dict or list of
|
| Don't parse #[code Doc] and instead, expect a dict or list of
|
||||||
| dicts. #[+a("/docs/usage/visualizers#manual-usage") See here]
|
| dicts. #[+a("/usage/visualizers#manual-usage") See here]
|
||||||
| for formats and examples.
|
| for formats and examples.
|
||||||
+cell #[code False]
|
+cell #[code False]
|
||||||
|
|
||||||
+footrow
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell Rendered HTML markup.
|
+cell Rendered HTML markup.
|
||||||
+cell
|
+cell
|
||||||
|
|
||||||
+h(2, "options") Visualizer options
|
+h(3, "displacy_options") Visualizer options
|
||||||
|
|
||||||
p
|
p
|
||||||
| The #[code options] argument lets you specify additional settings for
|
| The #[code options] argument lets you specify additional settings for
|
||||||
| each visualizer. If a setting is not present in the options, the default
|
| each visualizer. If a setting is not present in the options, the default
|
||||||
| value will be used.
|
| value will be used.
|
||||||
|
|
||||||
+h(3, "options-dep") Dependency Visualizer options
|
+h(4, "options-dep") Dependency Visualizer options
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
options = {'compact': True, 'color': 'blue'}
|
options = {'compact': True, 'color': 'blue'}
|
||||||
|
@ -219,7 +217,7 @@ p
|
||||||
+cell Distance between words in px.
|
+cell Distance between words in px.
|
||||||
+cell #[code 175] / #[code 85] (compact)
|
+cell #[code 175] / #[code 85] (compact)
|
||||||
|
|
||||||
+h(3, "options-ent") Named Entity Visualizer options
|
+h(4, "displacy_options-ent") Named Entity Visualizer options
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
options = {'ents': ['PERSON', 'ORG', 'PRODUCT'],
|
options = {'ents': ['PERSON', 'ORG', 'PRODUCT'],
|
||||||
|
@ -244,6 +242,6 @@ p
|
||||||
|
|
||||||
p
|
p
|
||||||
| By default, displaCy comes with colours for all
|
| By default, displaCy comes with colours for all
|
||||||
| #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy].
|
| #[+a("/api/annotation#named-entities") entity types supported by spaCy].
|
||||||
| If you're using custom entity types, you can use the #[code colors]
|
| If you're using custom entity types, you can use the #[code colors]
|
||||||
| setting to add your own colours for them.
|
| setting to add your own colours for them.
|
|
@ -1,15 +1,13 @@
|
||||||
//- 💫 DOCS > API > SPACY
|
//- 💫 DOCS > API > TOP-LEVEL > SPACY
|
||||||
|
|
||||||
include ../../_includes/_mixins
|
+h(3, "spacy.load") spacy.load
|
||||||
|
|
||||||
+h(2, "load") spacy.load
|
|
||||||
+tag function
|
+tag function
|
||||||
+tag-model
|
+tag-model
|
||||||
|
|
||||||
p
|
p
|
||||||
| Load a model via its #[+a("/docs/usage/models#usage") shortcut link],
|
| Load a model via its #[+a("/usage/models#usage") shortcut link],
|
||||||
| the name of an installed
|
| the name of an installed
|
||||||
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
|
| #[+a("/usage/training#models-generating") model package], a unicode
|
||||||
| path or a #[code Path]-like object. spaCy will try resolving the load
|
| path or a #[code Path]-like object. spaCy will try resolving the load
|
||||||
| argument in this order. If a model is loaded from a shortcut link or
|
| argument in this order. If a model is loaded from a shortcut link or
|
||||||
| package name, spaCy will assume it's a Python package and import it and
|
| package name, spaCy will assume it's a Python package and import it and
|
||||||
|
@ -38,25 +36,57 @@ p
|
||||||
+cell list
|
+cell list
|
||||||
+cell
|
+cell
|
||||||
| Names of pipeline components to
|
| Names of pipeline components to
|
||||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
|
| #[+a("/usage/processing-pipelines#disabling") disable].
|
||||||
|
|
||||||
+footrow
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code Language]
|
+cell #[code Language]
|
||||||
+cell A #[code Language] object with the loaded model.
|
+cell A #[code Language] object with the loaded model.
|
||||||
|
|
||||||
+infobox("⚠️ Deprecation note")
|
+infobox("Deprecation note", "⚠️")
|
||||||
.o-block
|
.o-block
|
||||||
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
||||||
| will also raise an error if no model could be loaded and never just
|
| will also raise an error if no model could be loaded and never just
|
||||||
| return an empty #[code Language] object. If you need a blank language,
|
| return an empty #[code Language] object. If you need a blank language,
|
||||||
| you need to import it explicitly (#[code from spacy.lang.en import English])
|
| you can use the new function #[+api("spacy#blank") #[code spacy.blank()]]
|
||||||
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
|
| or import the class explicitly, e.g.
|
||||||
|
| #[code from spacy.lang.en import English].
|
||||||
|
|
||||||
+code-new nlp = spacy.load('/model')
|
+code-new nlp = spacy.load('/model')
|
||||||
+code-old nlp = spacy.load('en', path='/model')
|
+code-old nlp = spacy.load('en', path='/model')
|
||||||
|
|
||||||
+h(2, "info") spacy.info
|
+h(3, "spacy.blank") spacy.blank
|
||||||
|
+tag function
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
|
p
|
||||||
|
| Create a blank model of a given language class. This function is the
|
||||||
|
| twin of #[code spacy.load()].
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
nlp_en = spacy.blank('en')
|
||||||
|
nlp_de = spacy.blank('de')
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code name]
|
||||||
|
+cell unicode
|
||||||
|
+cell ISO code of the language class to load.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code disable]
|
||||||
|
+cell list
|
||||||
|
+cell
|
||||||
|
| Names of pipeline components to
|
||||||
|
| #[+a("/usage/processing-pipelines#disabling") disable].
|
||||||
|
|
||||||
|
+row("foot")
|
||||||
|
+cell returns
|
||||||
|
+cell #[code Language]
|
||||||
|
+cell An empty #[code Language] object of the appropriate subclass.
|
||||||
|
|
||||||
|
|
||||||
|
+h(4, "spacy.info") spacy.info
|
||||||
+tag function
|
+tag function
|
||||||
|
|
||||||
p
|
p
|
||||||
|
@ -83,13 +113,13 @@ p
|
||||||
+cell Print information as Markdown.
|
+cell Print information as Markdown.
|
||||||
|
|
||||||
|
|
||||||
+h(2, "explain") spacy.explain
|
+h(3, "spacy.explain") spacy.explain
|
||||||
+tag function
|
+tag function
|
||||||
|
|
||||||
p
|
p
|
||||||
| Get a description for a given POS tag, dependency label or entity type.
|
| Get a description for a given POS tag, dependency label or entity type.
|
||||||
| For a list of available terms, see
|
| For a list of available terms, see
|
||||||
| #[+src(gh("spacy", "spacy/glossary.py")) glossary.py].
|
| #[+src(gh("spacy", "spacy/glossary.py")) #[code glossary.py]].
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
spacy.explain('NORP')
|
spacy.explain('NORP')
|
||||||
|
@ -107,18 +137,18 @@ p
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell Term to explain.
|
+cell Term to explain.
|
||||||
|
|
||||||
+footrow
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell The explanation, or #[code None] if not found in the glossary.
|
+cell The explanation, or #[code None] if not found in the glossary.
|
||||||
|
|
||||||
+h(2, "set_factory") spacy.set_factory
|
+h(3, "spacy.set_factory") spacy.set_factory
|
||||||
+tag function
|
+tag function
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
||||||
p
|
p
|
||||||
| Set a factory that returns a custom
|
| Set a factory that returns a custom
|
||||||
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline]
|
| #[+a("/usage/processing-pipelines") processing pipeline]
|
||||||
| component. Factories are useful for creating stateful components, especially ones which depend on shared data.
|
| component. Factories are useful for creating stateful components, especially ones which depend on shared data.
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
|
@ -1,10 +1,8 @@
|
||||||
//- 💫 DOCS > API > UTIL
|
//- 💫 DOCS > API > TOP-LEVEL > UTIL
|
||||||
|
|
||||||
include ../../_includes/_mixins
|
|
||||||
|
|
||||||
p
|
p
|
||||||
| spaCy comes with a small collection of utility functions located in
|
| spaCy comes with a small collection of utility functions located in
|
||||||
| #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py].
|
| #[+src(gh("spaCy", "spacy/util.py")) #[code spacy/util.py]].
|
||||||
| Because utility functions are mostly intended for
|
| Because utility functions are mostly intended for
|
||||||
| #[strong internal use within spaCy], their behaviour may change with
|
| #[strong internal use within spaCy], their behaviour may change with
|
||||||
| future releases. The functions documented on this page should be safe
|
| future releases. The functions documented on this page should be safe
|
||||||
|
@ -12,7 +10,7 @@ p
|
||||||
| recommend having additional tests in place if your application depends on
|
| recommend having additional tests in place if your application depends on
|
||||||
| any of spaCy's utilities.
|
| any of spaCy's utilities.
|
||||||
|
|
||||||
+h(2, "get_data_path") util.get_data_path
|
+h(3, "util.get_data_path") util.get_data_path
|
||||||
+tag function
|
+tag function
|
||||||
|
|
||||||
p
|
p
|
||||||
|
@ -25,12 +23,12 @@ p
|
||||||
+cell bool
|
+cell bool
|
||||||
+cell Only return path if it exists, otherwise return #[code None].
|
+cell Only return path if it exists, otherwise return #[code None].
|
||||||
|
|
||||||
+footrow
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code Path] / #[code None]
|
+cell #[code Path] / #[code None]
|
||||||
+cell Data path or #[code None].
|
+cell Data path or #[code None].
|
||||||
|
|
||||||
+h(2, "set_data_path") util.set_data_path
|
+h(3, "util.set_data_path") util.set_data_path
|
||||||
+tag function
|
+tag function
|
||||||
|
|
||||||
p
|
p
|
||||||
|
@ -47,12 +45,12 @@ p
|
||||||
+cell unicode or #[code Path]
|
+cell unicode or #[code Path]
|
||||||
+cell Path to new data directory.
|
+cell Path to new data directory.
|
||||||
|
|
||||||
+h(2, "get_lang_class") util.get_lang_class
|
+h(3, "util.get_lang_class") util.get_lang_class
|
||||||
+tag function
|
+tag function
|
||||||
|
|
||||||
p
|
p
|
||||||
| Import and load a #[code Language] class. Allows lazy-loading
|
| Import and load a #[code Language] class. Allows lazy-loading
|
||||||
| #[+a("/docs/usage/adding-languages") language data] and importing
|
| #[+a("/usage/adding-languages") language data] and importing
|
||||||
| languages using the two-letter language code.
|
| languages using the two-letter language code.
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
|
@ -67,12 +65,12 @@ p
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell Two-letter language code, e.g. #[code 'en'].
|
+cell Two-letter language code, e.g. #[code 'en'].
|
||||||
|
|
||||||
+footrow
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code Language]
|
+cell #[code Language]
|
||||||
+cell Language class.
|
+cell Language class.
|
||||||
|
|
||||||
+h(2, "load_model") util.load_model
|
+h(3, "util.load_model") util.load_model
|
||||||
+tag function
|
+tag function
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
||||||
|
@ -101,12 +99,12 @@ p
|
||||||
+cell -
|
+cell -
|
||||||
+cell Specific overrides, like pipeline components to disable.
|
+cell Specific overrides, like pipeline components to disable.
|
||||||
|
|
||||||
+footrow
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code Language]
|
+cell #[code Language]
|
||||||
+cell #[code Language] class with the loaded model.
|
+cell #[code Language] class with the loaded model.
|
||||||
|
|
||||||
+h(2, "load_model_from_path") util.load_model_from_path
|
+h(3, "util.load_model_from_path") util.load_model_from_path
|
||||||
+tag function
|
+tag function
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
||||||
|
@ -139,18 +137,18 @@ p
|
||||||
+cell -
|
+cell -
|
||||||
+cell Specific overrides, like pipeline components to disable.
|
+cell Specific overrides, like pipeline components to disable.
|
||||||
|
|
||||||
+footrow
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code Language]
|
+cell #[code Language]
|
||||||
+cell #[code Language] class with the loaded model.
|
+cell #[code Language] class with the loaded model.
|
||||||
|
|
||||||
+h(2, "load_model_from_init_py") util.load_model_from_init_py
|
+h(3, "util.load_model_from_init_py") util.load_model_from_init_py
|
||||||
+tag function
|
+tag function
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
||||||
p
|
p
|
||||||
| A helper function to use in the #[code load()] method of a model package's
|
| A helper function to use in the #[code load()] method of a model package's
|
||||||
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py].
|
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]].
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
from spacy.util import load_model_from_init_py
|
from spacy.util import load_model_from_init_py
|
||||||
|
@ -169,12 +167,12 @@ p
|
||||||
+cell -
|
+cell -
|
||||||
+cell Specific overrides, like pipeline components to disable.
|
+cell Specific overrides, like pipeline components to disable.
|
||||||
|
|
||||||
+footrow
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code Language]
|
+cell #[code Language]
|
||||||
+cell #[code Language] class with the loaded model.
|
+cell #[code Language] class with the loaded model.
|
||||||
|
|
||||||
+h(2, "get_model_meta") util.get_model_meta
|
+h(3, "util.get_model_meta") util.get_model_meta
|
||||||
+tag function
|
+tag function
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
||||||
|
@ -190,17 +188,17 @@ p
|
||||||
+cell unicode or #[code Path]
|
+cell unicode or #[code Path]
|
||||||
+cell Path to model directory.
|
+cell Path to model directory.
|
||||||
|
|
||||||
+footrow
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell dict
|
+cell dict
|
||||||
+cell The model's meta data.
|
+cell The model's meta data.
|
||||||
|
|
||||||
+h(2, "is_package") util.is_package
|
+h(3, "util.is_package") util.is_package
|
||||||
+tag function
|
+tag function
|
||||||
|
|
||||||
p
|
p
|
||||||
| Check if string maps to a package installed via pip. Mainly used to
|
| Check if string maps to a package installed via pip. Mainly used to
|
||||||
| validate #[+a("/docs/usage/models") model packages].
|
| validate #[+a("/usage/models") model packages].
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
util.is_package('en_core_web_sm') # True
|
util.is_package('en_core_web_sm') # True
|
||||||
|
@ -212,18 +210,18 @@ p
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell Name of package.
|
+cell Name of package.
|
||||||
|
|
||||||
+footrow
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code bool]
|
+cell #[code bool]
|
||||||
+cell #[code True] if installed package, #[code False] if not.
|
+cell #[code True] if installed package, #[code False] if not.
|
||||||
|
|
||||||
+h(2, "get_package_path") util.get_package_path
|
+h(3, "util.get_package_path") util.get_package_path
|
||||||
+tag function
|
+tag function
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
||||||
p
|
p
|
||||||
| Get path to an installed package. Mainly used to resolve the location of
|
| Get path to an installed package. Mainly used to resolve the location of
|
||||||
| #[+a("/docs/usage/models") model packages]. Currently imports the package
|
| #[+a("/usage/models") model packages]. Currently imports the package
|
||||||
| to find its path.
|
| to find its path.
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
|
@ -236,12 +234,12 @@ p
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell Name of installed package.
|
+cell Name of installed package.
|
||||||
|
|
||||||
+footrow
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code Path]
|
+cell #[code Path]
|
||||||
+cell Path to model package directory.
|
+cell Path to model package directory.
|
||||||
|
|
||||||
+h(2, "is_in_jupyter") util.is_in_jupyter
|
+h(3, "util.is_in_jupyter") util.is_in_jupyter
|
||||||
+tag function
|
+tag function
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
|
||||||
|
@ -257,17 +255,17 @@ p
|
||||||
return display(HTML(html))
|
return display(HTML(html))
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+footrow
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell bool
|
+cell bool
|
||||||
+cell #[code True] if in Jupyter, #[code False] if not.
|
+cell #[code True] if in Jupyter, #[code False] if not.
|
||||||
|
|
||||||
+h(2, "update_exc") util.update_exc
|
+h(3, "util.update_exc") util.update_exc
|
||||||
+tag function
|
+tag function
|
||||||
|
|
||||||
p
|
p
|
||||||
| Update, validate and overwrite
|
| Update, validate and overwrite
|
||||||
| #[+a("/docs/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions].
|
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions].
|
||||||
| Used to combine global exceptions with custom, language-specific
|
| Used to combine global exceptions with custom, language-specific
|
||||||
| exceptions. Will raise an error if key doesn't match #[code ORTH] values.
|
| exceptions. Will raise an error if key doesn't match #[code ORTH] values.
|
||||||
|
|
||||||
|
@ -288,20 +286,20 @@ p
         +cell dicts
         +cell Exception dictionaries to add to the base exceptions, in order.

-    +footrow
+    +row("foot")
         +cell returns
         +cell dict
         +cell Combined tokenizer exceptions.


-+h(2, "prints") util.prints
++h(3, "util.prints") util.prints
     +tag function
     +tag-new(2)

 p
     | Print a formatted, text-wrapped message with optional title. If a text
     | argument is a #[code Path], it's converted to a string. Should only
-    | be used for interactive components like the #[+api("cli") cli].
+    | be used for interactive components like the command-line interface.

 +aside-code("Example").
     data_path = Path('/some/path')
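A rough usage sketch of util.prints, following the description above; the title keyword and the exact output formatting are assumptions:

    from pathlib import Path
    from spacy import util

    data_path = Path('/some/path')
    # Path arguments are converted to strings; the message is text-wrapped.
    util.prints("Loading data from", data_path, title="Some title")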
131
website/api/annotation.jade
Normal file

@ -0,0 +1,131 @@
//- 💫 DOCS > API > ANNOTATION SPECS

include ../_includes/_mixins

p This document describes the target annotations spaCy is trained to predict.

+section("tokenization")
    +h(2, "tokenization") Tokenization

    p
        | Tokenization standards are based on the
        | #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] corpus.
        | The tokenizer differs from most by including tokens for significant
        | whitespace. Any sequence of whitespace characters beyond a single space
        | (#[code ' ']) is included as a token.

    +aside-code("Example").
        from spacy.lang.en import English
        nlp = English()
        tokens = nlp('Some\nspaces and\ttab characters')
        tokens_text = [t.text for t in tokens]
        assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and',
                               '\t', 'tab', 'characters']

    p
        | The whitespace tokens are useful for much the same reason punctuation is
        | – it's often an important delimiter in the text. By preserving it in the
        | token output, we are able to maintain a simple alignment between the
        | tokens and the original string, and we ensure that no information is
        | lost during processing.

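To make the alignment point concrete, a small sketch (assuming the blank English tokenizer used in the example above):

    from spacy.lang.en import English

    nlp = English()
    doc = nlp('Some\nspaces and\ttab characters')
    # Whitespace is preserved as tokens, so the original string can be
    # reconstructed exactly and every token knows its character offset.
    assert doc.text == ''.join(t.text_with_ws for t in doc)
    assert all(doc.text[t.idx] == t.text[0] for t in doc)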
+section("sbd")
    +h(2, "sentence-boundary") Sentence boundary detection

    p
        | Sentence boundaries are calculated from the syntactic parse tree, so
        | features such as punctuation and capitalisation play an important but
        | non-decisive role in determining the sentence boundaries. Usually this
        | means that the sentence boundaries will at least coincide with clause
        | boundaries, even given poorly punctuated text.

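A usage sketch for reading the resulting sentence boundaries; it assumes an installed model that includes the dependency parser (en_core_web_sm is used here only as an example):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp("This is a sentence. This is another sentence.")
    # Sentence spans are derived from the parse tree.
    for sent in doc.sents:
        print(sent.text)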
+section("pos-tagging")
    +h(2, "pos-tagging") Part-of-speech Tagging

    +aside("Tip: Understanding tags")
        | You can also use #[code spacy.explain()] to get the description for the
        | string representation of a tag. For example,
        | #[code spacy.explain("RB")] will return "adverb".

    include _annotation/_pos-tags

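A short sketch of inspecting the predicted tags, assuming an installed English model (en_core_web_sm here is only an example):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp("Apple is looking at buying a U.K. startup.")
    for token in doc:
        # Coarse-grained tag, fine-grained OntoNotes/PTB tag, and its description.
        print(token.text, token.pos_, token.tag_, spacy.explain(token.tag_))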
+section("lemmatization")
    +h(2, "lemmatization") Lemmatization

    p A "lemma" is the uninflected form of a word. In English, this means:

    +list
        +item #[strong Adjectives]: The form like "happy", not "happier" or "happiest"
        +item #[strong Adverbs]: The form like "badly", not "worse" or "worst"
        +item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children"
        +item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written"

    p
        | The lemmatization data is taken from
        | #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a
        | special case for pronouns: all pronouns are lemmatized to the special
        | token #[code -PRON-].

    +infobox("About spaCy's custom pronoun lemma")
        | Unlike verbs and common nouns, there's no clear base form of a personal
        | pronoun. Should the lemma of "me" be "I", or should we normalize person
        | as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
        | novel symbol, #[code -PRON-], which is used as the lemma for
        | all personal pronouns.

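A sketch of reading lemmas, including the -PRON- special case, assuming an installed English model:

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp("I was reading the papers they wrote")
    print([(token.text, token.lemma_) for token in doc])
    # e.g. [('I', '-PRON-'), ('was', 'be'), ('reading', 'read'), ...]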
+section("dependency-parsing")
    +h(2, "dependency-parsing") Syntactic Dependency Parsing

    +aside("Tip: Understanding labels")
        | You can also use #[code spacy.explain()] to get the description for the
        | string representation of a label. For example,
        | #[code spacy.explain("prt")] will return "particle".

    include _annotation/_dep-labels

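A sketch of reading the dependency labels described above, assuming an installed English model with a parser:

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
    for token in doc:
        print(token.text, token.dep_, token.head.text, spacy.explain(token.dep_))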
+section("named-entities")
    +h(2, "named-entities") Named Entity Recognition

    +aside("Tip: Understanding entity types")
        | You can also use #[code spacy.explain()] to get the description for the
        | string representation of an entity label. For example,
        | #[code spacy.explain("LANGUAGE")] will return "any named language".

    include _annotation/_named-entities

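A sketch of reading the predicted entity types, assuming an installed English model with an entity recognizer:

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp("Apple is opening its first big office in San Francisco.")
    print([(ent.text, ent.label_, spacy.explain(ent.label_)) for ent in doc.ents])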
    +h(3, "biluo") BILUO Scheme

    include _annotation/_biluo

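A sketch of converting character offsets into BILUO tags; it assumes spaCy 2.x, where the helper lives in spacy.gold, and only a blank tokenizer is needed:

    from spacy.lang.en import English
    from spacy.gold import biluo_tags_from_offsets

    nlp = English()
    doc = nlp("I like London.")
    # (start, end, label) character offsets -> per-token BILUO tags.
    tags = biluo_tags_from_offsets(doc, [(7, 13, 'GPE')])
    print(tags)  # ['O', 'O', 'U-GPE', 'O']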
+section("training")
    +h(2, "json-input") JSON input format for training

    +under-construction

    p spaCy takes training data in the following format:

    +code("Example structure").
        doc: {
            id: string,
            paragraphs: [{
                raw: string,
                sents: [int],
                tokens: [{
                    start: int,
                    tag: string,
                    head: int,
                    dep: string
                }],
                ner: [{
                    start: int,
                    end: int,
                    label: string
                }],
                brackets: [{
                    start: int,
                    end: int,
                    label: string
                }]
            }]
        }
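A minimal, hypothetical instance of the structure above, written as a Python dict purely for illustration; the token values, the relative "head" offsets and the exclusive "end" offsets are assumptions, since the format is still marked as under construction:

    TRAIN_DOC = {
        "id": "doc1",
        "paragraphs": [{
            "raw": "I like London.",
            "sents": [0],
            "tokens": [
                {"start": 0, "tag": "PRP", "head": 1, "dep": "nsubj"},
                {"start": 2, "tag": "VBP", "head": 0, "dep": "ROOT"},
                {"start": 7, "tag": "NNP", "head": -1, "dep": "dobj"},
                {"start": 13, "tag": ".", "head": -2, "dep": "punct"},
            ],
            "ner": [{"start": 7, "end": 13, "label": "GPE"}],
            "brackets": [],
        }],
    }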
@ -1,6 +1,6 @@
 //- 💫 DOCS > API > BINDER

-include ../../_includes/_mixins
+include ../_includes/_mixins

 p A container class for serializing collections of #[code Doc] objects.
5
website/api/dependencyparser.jade
Normal file

@ -0,0 +1,5 @@
//- 💫 DOCS > API > DEPENDENCYPARSER

include ../_includes/_mixins

!=partial("pipe", { subclass: "DependencyParser", short: "parser", pipeline_id: "parser" })
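The page above is generated from the shared "pipe" template, since the parser is just another pipeline component. A usage sketch, assuming an installed English model:

    import spacy

    nlp = spacy.load('en_core_web_sm')
    parser = nlp.get_pipe('parser')  # DependencyParser instance
    doc = nlp("The quick brown fox jumps over the lazy dog")
    print([(t.text, t.dep_) for t in doc])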
Some files were not shown because too many files have changed in this diff.