Wrap try/except around model saving

Matthew Honnibal 2017-10-05 08:14:24 -05:00
commit c6cd81f192
331 changed files with 10443 additions and 10377 deletions
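The title names the core change: the model-saving step at the end of training is guarded so that a serialization failure is reported instead of silently losing the run. A minimal sketch of that pattern, with illustrative names (nlp and output_path stand in for whatever the CLI actually uses):

from pathlib import Path

def save_final_model(nlp, output_path):
    # Guard the final serialization; a failure here should not discard the
    # training results without a message.
    try:
        nlp.to_disk(Path(output_path) / 'model-final')
    except Exception as err:
        print("Error saving model to %s: %s" % (output_path, err))
        raise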


@ -1 +1,55 @@
environment:
matrix:
# For Python versions available on Appveyor, see
# http://www.appveyor.com/docs/installed-software#python
# The list here is complete (excluding Python 2.6, which
# isn't covered by this document) at the time of writing.
- PYTHON: "C:\\Python27"
#- PYTHON: "C:\\Python33"
#- PYTHON: "C:\\Python34"
#- PYTHON: "C:\\Python35"
#- PYTHON: "C:\\Python27-x64"
#- PYTHON: "C:\\Python33-x64"
#- DISTUTILS_USE_SDK: "1"
#- PYTHON: "C:\\Python34-x64"
#- DISTUTILS_USE_SDK: "1"
#- PYTHON: "C:\\Python35-x64"
- PYTHON: "C:\\Python36-x64"
install:
# We need wheel installed to build wheels
- "%PYTHON%\\python.exe -m pip install wheel"
- "%PYTHON%\\python.exe -m pip install cython"
- "%PYTHON%\\python.exe -m pip install -r requirements.txt"
- "%PYTHON%\\python.exe -m pip install -e ."
build: off
test_script:
# Put your test command here.
# If you don't need to build C extensions on 64-bit Python 3.3 or 3.4,
# you can remove "build.cmd" from the front of the command, as it's
# only needed to support those cases.
# Note that you must use the environment variable %PYTHON% to refer to
# the interpreter you're using - Appveyor does not do anything special
# to put the Python version you want to use on PATH.
- "%PYTHON%\\python.exe -m pytest spacy/"
after_test:
# This step builds your wheels.
# Again, you only need build.cmd if you're building C extensions for
# 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct
# interpreter
- "%PYTHON%\\python.exe setup.py bdist_wheel"
artifacts:
# bdist_wheel puts your built wheel in the dist directory
- path: dist\*
#on_success:
# You can use this step to upload your artifacts to a public website.
# See Appveyor's documentation for more details. Or you can simply
# access your wheels from the Appveyor "artifacts" tab for your build.

.buildkite/sdist.yml (new file)

@ -0,0 +1,11 @@
steps:
-
command: "fab env clean make test sdist"
label: ":dizzy: :python:"
artifact_paths: "dist/*.tar.gz"
- wait
- trigger: "spacy-sdist-against-models"
label: ":dizzy: :hammer:"
build:
env:
SPACY_VERSION: "{$SPACY_VERSION}"

.gitignore

@ -1,14 +1,12 @@
# spaCy
spacy/data/
corpora/
models/
/models/
keys/
# Website
website/www/
website/_deploy.sh
website/package.json
website/announcement.jade
website/.gitignore
# Cython / C extensions


@ -1,322 +0,0 @@
'''WIP --- Doesn't work well yet'''
import plac
import random
import six
import cProfile
import pstats
import pathlib
import cPickle as pickle
from itertools import izip
import spacy
import cytoolz
import cupy as xp
import cupy.cuda
import chainer.cuda
import chainer.links as L
import chainer.functions as F
from chainer import Chain, Variable, report
import chainer.training
import chainer.optimizers
from chainer.training import extensions
from chainer.iterators import SerialIterator
from chainer.datasets import TupleDataset
class SentimentAnalyser(object):
@classmethod
def load(cls, path, nlp, max_length=100):
raise NotImplementedError
#with (path / 'config.json').open() as file_:
# model = model_from_json(file_.read())
#with (path / 'model').open('rb') as file_:
# lstm_weights = pickle.load(file_)
#embeddings = get_embeddings(nlp.vocab)
#model.set_weights([embeddings] + lstm_weights)
#return cls(model, max_length=max_length)
def __init__(self, model, max_length=100):
self._model = model
self.max_length = max_length
def __call__(self, doc):
X = get_features([doc], self.max_length)
y = self._model.predict(X)
self.set_sentiment(doc, y)
def pipe(self, docs, batch_size=1000, n_threads=2):
for minibatch in cytoolz.partition_all(batch_size, docs):
minibatch = list(minibatch)
sentences = []
for doc in minibatch:
sentences.extend(doc.sents)
Xs = get_features(sentences, self.max_length)
ys = self._model.predict(Xs)
for sent, label in zip(sentences, ys):
sent.doc.sentiment += label - 0.5
for doc in minibatch:
yield doc
def set_sentiment(self, doc, y):
doc.sentiment = float(y[0])
# Sentiment has a native slot for a single float.
# For arbitrary data storage, there's:
# doc.user_data['my_data'] = y
class Classifier(Chain):
def __init__(self, predictor):
super(Classifier, self).__init__(predictor=predictor)
def __call__(self, x, t):
y = self.predictor(x)
loss = F.softmax_cross_entropy(y, t)
accuracy = F.accuracy(y, t)
report({'loss': loss, 'accuracy': accuracy}, self)
return loss
class SentimentModel(Chain):
def __init__(self, nlp, shape, **settings):
Chain.__init__(self,
embed=_Embed(shape['nr_vector'], shape['nr_dim'], shape['nr_hidden'],
set_vectors=lambda arr: set_vectors(arr, nlp.vocab)),
encode=_Encode(shape['nr_hidden'], shape['nr_hidden']),
attend=_Attend(shape['nr_hidden'], shape['nr_hidden']),
predict=_Predict(shape['nr_hidden'], shape['nr_class']))
self.to_gpu(0)
def __call__(self, sentence):
return self.predict(
self.attend(
self.encode(
self.embed(sentence))))
class _Embed(Chain):
def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None):
Chain.__init__(self,
embed=L.EmbedID(nr_vector, nr_dim, initialW=set_vectors),
project=L.Linear(None, nr_out, nobias=True))
self.embed.W.volatile = False
def __call__(self, sentence):
return [self.project(self.embed(ts)) for ts in F.transpose(sentence)]
class _Encode(Chain):
def __init__(self, nr_in, nr_out):
Chain.__init__(self,
fwd=L.LSTM(nr_in, nr_out),
bwd=L.LSTM(nr_in, nr_out),
mix=L.Bilinear(nr_out, nr_out, nr_out))
def __call__(self, sentence):
self.fwd.reset_state()
fwds = map(self.fwd, sentence)
self.bwd.reset_state()
bwds = reversed(map(self.bwd, reversed(sentence)))
return [F.elu(self.mix(f, b)) for f, b in zip(fwds, bwds)]
class _Attend(Chain):
def __init__(self, nr_in, nr_out):
Chain.__init__(self)
def __call__(self, sentence):
sent = sum(sentence)
return sent
class _Predict(Chain):
def __init__(self, nr_in, nr_out):
Chain.__init__(self,
l1=L.Linear(nr_in, nr_in),
l2=L.Linear(nr_in, nr_out))
def __call__(self, vector):
vector = self.l1(vector)
vector = F.elu(vector)
vector = self.l2(vector)
return vector
class SentenceDataset(TupleDataset):
def __init__(self, nlp, texts, labels, max_length):
self.max_length = max_length
sents, labels = self._get_labelled_sentences(
nlp.pipe(texts, batch_size=5000, n_threads=3),
labels)
TupleDataset.__init__(self,
get_features(sents, max_length),
labels)
def __getitem__(self, index):
batches = [dataset[index] for dataset in self._datasets]
if isinstance(index, slice):
length = len(batches[0])
returns = [tuple([batch[i] for batch in batches])
for i in six.moves.range(length)]
return returns
else:
return tuple(batches)
def _get_labelled_sentences(self, docs, doc_labels):
labels = []
sentences = []
for doc, y in izip(docs, doc_labels):
for sent in doc.sents:
sentences.append(sent)
labels.append(y)
return sentences, xp.asarray(labels, dtype='i')
class DocDataset(TupleDataset):
def __init__(self, nlp, texts, labels):
self.max_length = max_length
DatasetMixin.__init__(self,
get_features(
nlp.pipe(texts, batch_size=5000, n_threads=3), self.max_length),
labels)
def read_data(data_dir, limit=0):
examples = []
for subdir, label in (('pos', 1), ('neg', 0)):
for filename in (data_dir / subdir).iterdir():
with filename.open() as file_:
text = file_.read()
examples.append((text, label))
random.shuffle(examples)
if limit >= 1:
examples = examples[:limit]
return zip(*examples) # Unzips into two lists
def get_features(docs, max_length):
docs = list(docs)
Xs = xp.zeros((len(docs), max_length), dtype='i')
for i, doc in enumerate(docs):
j = 0
for token in doc:
if token.has_vector and not token.is_punct and not token.is_space:
Xs[i, j] = token.norm
j += 1
if j >= max_length:
break
return Xs
def set_vectors(vectors, vocab):
for lex in vocab:
if lex.has_vector and (lex.rank+1) < vectors.shape[0]:
lex.norm = lex.rank+1
vectors[lex.rank + 1] = lex.vector
else:
lex.norm = 0
return vectors
def train(train_texts, train_labels, dev_texts, dev_labels,
lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
by_sentence=True):
nlp = spacy.load('en', entity=False)
if 'nr_vector' not in lstm_shape:
lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector)
if 'nr_dim' not in lstm_shape:
lstm_shape['nr_dim'] = nlp.vocab.vectors_length
print("Make model")
model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings))
print("Parsing texts...")
if by_sentence:
train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length'])
dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length'])
else:
train_data = DocDataset(nlp, train_texts, train_labels)
dev_data = DocDataset(nlp, dev_texts, dev_labels)
train_iter = SerialIterator(train_data, batch_size=batch_size,
shuffle=True, repeat=True)
dev_iter = SerialIterator(dev_data, batch_size=batch_size,
shuffle=False, repeat=False)
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)
updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0)
trainer = chainer.training.Trainer(updater, (1, 'epoch'), out='result')
trainer.extend(extensions.Evaluator(dev_iter, model, device=0))
trainer.extend(extensions.LogReport())
trainer.extend(extensions.PrintReport([
'epoch', 'main/accuracy', 'validation/main/accuracy']))
trainer.extend(extensions.ProgressBar())
trainer.run()
def evaluate(model_dir, texts, labels, max_length=100):
def create_pipeline(nlp):
'''
This could be a lambda, but named functions are easier to read in Python.
'''
return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp,
max_length=max_length)]
nlp = spacy.load('en')
nlp.pipeline = create_pipeline(nlp)
correct = 0
i = 0
for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
correct += bool(doc.sentiment >= 0.5) == bool(labels[i])
i += 1
return float(correct) / i
@plac.annotations(
train_dir=("Location of training file or directory"),
dev_dir=("Location of development file or directory"),
model_dir=("Location of output model directory",),
is_runtime=("Demonstrate run-time usage", "flag", "r", bool),
nr_hidden=("Number of hidden units", "option", "H", int),
max_length=("Maximum sentence length", "option", "L", int),
dropout=("Dropout", "option", "d", float),
learn_rate=("Learn rate", "option", "e", float),
nb_epoch=("Number of training epochs", "option", "i", int),
batch_size=("Size of minibatches for training LSTM", "option", "b", int),
nr_examples=("Limit to N examples", "option", "n", int)
)
def main(model_dir, train_dir, dev_dir,
is_runtime=False,
nr_hidden=64, max_length=100, # Shape
dropout=0.5, learn_rate=0.001, # General NN config
nb_epoch=5, batch_size=32, nr_examples=-1): # Training params
model_dir = pathlib.Path(model_dir)
train_dir = pathlib.Path(train_dir)
dev_dir = pathlib.Path(dev_dir)
if is_runtime:
dev_texts, dev_labels = read_data(dev_dir)
acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
print(acc)
else:
print("Read data")
train_texts, train_labels = read_data(train_dir, limit=nr_examples)
dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
print("Using GPU 0")
#chainer.cuda.get_device(0).use()
train_labels = xp.asarray(train_labels, dtype='i')
dev_labels = xp.asarray(dev_labels, dtype='i')
lstm = train(train_texts, train_labels, dev_texts, dev_labels,
{'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2,
'nr_vector': 5000},
{'dropout': 0.5, 'lr': learn_rate},
{},
nb_epoch=nb_epoch, batch_size=batch_size)
if __name__ == '__main__':
#cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
#s = pstats.Stats("Profile.prof")
#s.strip_dirs().sort_stats("time").print_stats()
plac.call(main)


@ -20,72 +20,72 @@ The algorithm is O(n) at run-time for a document of length n because we're only ever
matching over the tag patterns. So no matter how many phrases we're looking for,
our pattern set stays very small (exact size depends on the maximum length we're
looking for, as the query language currently has no quantifiers)
The example expects a .bz2 file from the Reddit corpus, and a patterns file,
formatted in jsonl as a sequence of entries like this:
{"text":"Anchorage"}
{"text":"Angola"}
{"text":"Ann Arbor"}
{"text":"Annapolis"}
{"text":"Appalachia"}
{"text":"Argentina"}
"""
from __future__ import print_function, unicode_literals, division
from ast import literal_eval
from bz2 import BZ2File
import time
import math
import codecs
import plac
import ujson
from preshed.maps import PreshMap
from preshed.counter import PreshCounter
from spacy.strings import hash_string
from spacy.en import English
from spacy.matcher import PhraseMatcher
import spacy
def read_gazetteer(tokenizer, loc, n=-1):
for i, line in enumerate(open(loc)):
phrase = literal_eval('u' + line.strip())
if ' (' in phrase and phrase.endswith(')'):
phrase = phrase.split(' (', 1)[0]
if i >= n:
break
phrase = tokenizer(phrase)
if all((t.is_lower and t.prob >= -10) for t in phrase):
continue
data = ujson.loads(line.strip())
phrase = tokenizer(data['text'])
for w in phrase:
_ = tokenizer.vocab[w.text]
if len(phrase) >= 2:
yield phrase
def read_text(bz2_loc):
def read_text(bz2_loc, n=10000):
with BZ2File(bz2_loc) as file_:
for line in file_:
yield line.decode('utf8')
for i, line in enumerate(file_):
data = ujson.loads(line)
yield data['body']
if i >= n:
break
def get_matches(tokenizer, phrases, texts, max_length=6):
matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length)
print("Match")
matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
matcher.add('Phrase', None, *phrases)
for text in texts:
doc = tokenizer(text)
for w in doc:
_ = doc.vocab[w.text]
matches = matcher(doc)
for mwe in doc.ents:
yield mwe
for ent_id, start, end in matches:
yield (ent_id, doc[start:end].text)
def main(patterns_loc, text_loc, counts_loc, n=10000000):
nlp = English(parser=False, tagger=False, entity=False)
print("Make matcher")
phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
counts = PreshCounter()
def main(patterns_loc, text_loc, n=10000):
nlp = spacy.blank('en')
nlp.vocab.lex_attr_getters = {}
phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
count = 0
t1 = time.time()
for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
counts.inc(hash_string(mwe.text), 1)
for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
count += 1
t2 = time.time()
print("10m tokens in %d s" % (t2 - t1))
with codecs.open(counts_loc, 'w', 'utf8') as file_:
for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
text = phrase.string
key = hash_string(text)
count = counts[key]
if count != 0:
file_.write('%d\t%s\n' % (count, text))
print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
if __name__ == '__main__':
if False:
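The hunk above moves this example to the spaCy v2 PhraseMatcher API: patterns are Doc objects registered with matcher.add(), and matches come back as (match_id, start, end) tuples rather than pre-merged entities. A self-contained sketch of that API, reusing two of the gazetteer entries quoted in the docstring:

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')
matcher = PhraseMatcher(nlp.vocab)
# Each pattern is a tokenized Doc; all of them share the 'Phrase' label.
patterns = [nlp.tokenizer(text) for text in ('Anchorage', 'Ann Arbor')]
matcher.add('Phrase', None, *patterns)
doc = nlp.tokenizer('She moved from Anchorage to Ann Arbor last year.')
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)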


@ -13,24 +13,29 @@ Input data:
https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip
Developed for: spaCy 1.7.1
Last tested for: spaCy 1.7.1
Last tested for: spaCy 2.0.0a13
'''
from __future__ import unicode_literals, print_function
import plac
from pathlib import Path
import random
import json
import tqdm
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
import spacy.orth as orth_funcs
from spacy.vocab import Vocab
from spacy.pipeline import BeamEntityRecognizer
from spacy.pipeline import EntityRecognizer
from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.attrs import *
from spacy.gold import GoldParse
from spacy.gold import _iob_to_biluo as iob_to_biluo
from spacy.gold import iob_to_biluo
from spacy.gold import minibatch
from spacy.scorer import Scorer
import spacy.util
try:
unicode
@ -38,96 +43,38 @@ except NameError:
unicode = str
spacy.util.set_env_log(True)
def init_vocab():
return Vocab(
lex_attr_getters={
LOWER: lambda string: string.lower(),
SHAPE: orth_funcs.word_shape,
NORM: lambda string: string.lower(),
PREFIX: lambda string: string[0],
SUFFIX: lambda string: string[-3:],
CLUSTER: lambda string: 0,
IS_ALPHA: orth_funcs.is_alpha,
IS_ASCII: orth_funcs.is_ascii,
IS_DIGIT: lambda string: string.isdigit(),
IS_LOWER: orth_funcs.is_lower,
IS_PUNCT: orth_funcs.is_punct,
IS_SPACE: lambda string: string.isspace(),
IS_TITLE: orth_funcs.is_title,
IS_UPPER: orth_funcs.is_upper,
IS_STOP: lambda string: False,
IS_OOV: lambda string: True
})
def save_vocab(vocab, path):
path = Path(path)
if not path.exists():
path.mkdir()
elif not path.is_dir():
raise IOError("Can't save vocab to %s\nNot a directory" % path)
with (path / 'strings.json').open('w') as file_:
vocab.strings.dump(file_)
vocab.dump((path / 'lexemes.bin').as_posix())
def load_vocab(path):
path = Path(path)
if not path.exists():
raise IOError("Cannot load vocab from %s\nDoes not exist" % path)
if not path.is_dir():
raise IOError("Cannot load vocab from %s\nNot a directory" % path)
return Vocab.load(path)
def init_ner_model(vocab, features=None):
if features is None:
features = tuple(EntityRecognizer.feature_templates)
return EntityRecognizer(vocab, features=features)
def save_ner_model(model, path):
path = Path(path)
if not path.exists():
path.mkdir()
if not path.is_dir():
raise IOError("Can't save model to %s\nNot a directory" % path)
model.model.dump((path / 'model').as_posix())
with (path / 'config.json').open('w') as file_:
data = json.dumps(model.cfg)
if not isinstance(data, unicode):
data = data.decode('utf8')
file_.write(data)
def load_ner_model(vocab, path):
return EntityRecognizer.load(path, vocab)
class Pipeline(object):
@classmethod
def load(cls, path):
path = Path(path)
if not path.exists():
raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
if not path.is_dir():
raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
vocab = load_vocab(path)
tokenizer = Tokenizer(vocab, {}, None, None, None)
ner_model = load_ner_model(vocab, path / 'ner')
return cls(vocab, tokenizer, ner_model)
def __init__(self, vocab=None, tokenizer=None, entity=None):
if vocab is None:
vocab = init_vocab()
if tokenizer is None:
tokenizer = Tokenizer(vocab, {}, None, None, None)
if entity is None:
entity = init_ner_model(self.vocab)
entity = NeuralEntityRecognizer(vocab)
self.vocab = vocab
self.tokenizer = tokenizer
self.entity = entity
self.pipeline = [self.entity]
def begin_training(self):
for model in self.pipeline:
model.begin_training([])
optimizer = Adam(NumpyOps(), 0.001)
return optimizer
def __call__(self, input_):
doc = self.make_doc(input_)
for process in self.pipeline:
@ -147,14 +94,16 @@ class Pipeline(object):
gold = GoldParse(doc, entities=annotations)
return gold
def update(self, input_, annot):
doc = self.make_doc(input_)
gold = self.make_gold(input_, annot)
for ner in gold.ner:
if ner not in (None, '-', 'O'):
action, label = ner.split('-', 1)
self.entity.add_label(label)
return self.entity.update(doc, gold)
def update(self, inputs, annots, sgd, losses=None, drop=0.):
if losses is None:
losses = {}
docs = [self.make_doc(input_) for input_ in inputs]
golds = [self.make_gold(input_, annot) for input_, annot in
zip(inputs, annots)]
self.entity.update(docs, golds, drop=drop,
sgd=sgd, losses=losses)
return losses
def evaluate(self, examples):
scorer = Scorer()
@ -164,34 +113,36 @@ class Pipeline(object):
scorer.score(doc, gold)
return scorer.scores
def average_weights(self):
self.entity.model.end_training()
def save(self, path):
def to_disk(self, path):
path = Path(path)
if not path.exists():
path.mkdir()
elif not path.is_dir():
raise IOError("Can't save pipeline to %s\nNot a directory" % path)
save_vocab(self.vocab, path / 'vocab')
save_ner_model(self.entity, path / 'ner')
self.vocab.to_disk(path / 'vocab')
self.entity.to_disk(path / 'ner')
def from_disk(self, path):
path = Path(path)
if not path.exists():
raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
if not path.is_dir():
raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
self.vocab = self.vocab.from_disk(path / 'vocab')
self.entity = self.entity.from_disk(path / 'ner')
def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
next_epoch = train_examples
def train(nlp, train_examples, dev_examples, nr_epoch=5):
sgd = nlp.begin_training()
print("Iter", "Loss", "P", "R", "F")
for i in range(nr_epoch):
this_epoch = next_epoch
next_epoch = []
loss = 0
for input_, annot in this_epoch:
loss += nlp.update(input_, annot)
if (i+1) < nr_epoch:
next_epoch.append((input_, annot))
random.shuffle(next_epoch)
random.shuffle(train_examples)
losses = {}
for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8):
inputs, annots = zip(*batch)
nlp.update(list(inputs), list(annots), sgd, losses=losses)
scores = nlp.evaluate(dev_examples)
report_scores(i, loss, scores)
nlp.average_weights()
report_scores(i, losses['ner'], scores)
scores = nlp.evaluate(dev_examples)
report_scores(channels, i+1, loss, scores)
@ -208,7 +159,8 @@ def read_examples(path):
with path.open() as file_:
sents = file_.read().strip().split('\n\n')
for sent in sents:
if not sent.strip():
sent = sent.strip()
if not sent:
continue
tokens = sent.split('\n')
while tokens and tokens[0].startswith('#'):
@ -217,28 +169,39 @@ def read_examples(path):
iob = []
for token in tokens:
if token.strip():
pieces = token.split()
pieces = token.split('\t')
words.append(pieces[1])
iob.append(pieces[2])
yield words, iob_to_biluo(iob)
def get_labels(examples):
labels = set()
for words, tags in examples:
for tag in tags:
if '-' in tag:
labels.add(tag.split('-')[1])
return sorted(labels)
@plac.annotations(
model_dir=("Path to save the model", "positional", None, Path),
train_loc=("Path to your training data", "positional", None, Path),
dev_loc=("Path to your development data", "positional", None, Path),
)
def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
train_loc=None, dev_loc=None, nr_epoch=30):
train_examples = read_examples(train_loc)
def main(model_dir, train_loc, dev_loc, nr_epoch=30):
print(model_dir, train_loc, dev_loc)
train_examples = list(read_examples(train_loc))
dev_examples = read_examples(dev_loc)
nlp = Pipeline.load(model_dir)
nlp = Pipeline()
for label in get_labels(train_examples):
nlp.entity.add_label(label)
print("Add label", label)
train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)
train(nlp, train_examples, list(dev_examples), nr_epoch)
nlp.save(model_dir)
nlp.to_disk(model_dir)
if __name__ == '__main__':
main()
plac.call(main)
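The loop above now trains on batches: minibatch() groups the examples, and the entity recognizer's update() takes lists of Docs and GoldParses together with the optimizer and a shared losses dict. The same pattern against a plain Language pipeline looks roughly like this (train_examples is a hypothetical list of (text, entity_offsets) pairs):

import random
from spacy.gold import GoldParse, minibatch

def train_epoch(nlp, train_examples, optimizer, drop=0.35):
    # One pass over the data in batches of 8, accumulating per-component losses.
    random.shuffle(train_examples)
    losses = {}
    for batch in minibatch(train_examples, size=8):
        texts, annots = zip(*batch)
        docs = [nlp.make_doc(text) for text in texts]
        golds = [GoldParse(doc, entities=ents) for doc, ents in zip(docs, annots)]
        nlp.update(docs, golds, sgd=optimizer, drop=drop, losses=losses)
    return losses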


@ -25,7 +25,7 @@ For more details, see the documentation:
* Saving and loading models: https://spacy.io/docs/usage/saving-loading
Developed for: spaCy 1.7.6
Last tested for: spaCy 1.7.6
Last updated for: spaCy 2.0.0a13
"""
from __future__ import unicode_literals, print_function
@ -34,55 +34,41 @@ from pathlib import Path
import random
import spacy
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.gold import GoldParse, minibatch
from spacy.pipeline import NeuralEntityRecognizer
from spacy.pipeline import TokenVectorEncoder
def get_gold_parses(tokenizer, train_data):
'''Shuffle and create GoldParse objects'''
random.shuffle(train_data)
for raw_text, entity_offsets in train_data:
doc = tokenizer(raw_text)
gold = GoldParse(doc, entities=entity_offsets)
yield doc, gold
def train_ner(nlp, train_data, output_dir):
# Add new words to vocab
for raw_text, _ in train_data:
doc = nlp.make_doc(raw_text)
for word in doc:
_ = nlp.vocab[word.orth]
random.seed(0)
# You may need to change the learning rate. It's generally difficult to
# guess what rate you should set, especially when you have limited data.
nlp.entity.model.learn_rate = 0.001
for itn in range(1000):
random.shuffle(train_data)
loss = 0.
for raw_text, entity_offsets in train_data:
gold = GoldParse(doc, entities=entity_offsets)
# By default, the GoldParse class assumes that the entities
# described by offset are complete, and all other words should
# have the tag 'O'. You can tell it to make no assumptions
# about the tag of a word by giving it the tag '-'.
# However, this allows a trivial solution to the current
# learning problem: if words are either 'any tag' or 'ANIMAL',
# the model can learn that all words can be tagged 'ANIMAL'.
#for i in range(len(gold.ner)):
#if not gold.ner[i].endswith('ANIMAL'):
# gold.ner[i] = '-'
doc = nlp.make_doc(raw_text)
nlp.tagger(doc)
# As of 1.9, spaCy's parser now lets you supply a dropout probability
# This might help the model generalize better from only a few
# examples.
loss += nlp.entity.update(doc, gold, drop=0.9)
if loss == 0:
break
# This step averages the model's weights. This may or may not be good for
# your situation --- it's empirical.
nlp.end_training()
if output_dir:
if not output_dir.exists():
output_dir.mkdir()
nlp.save_to_directory(output_dir)
optimizer = nlp.begin_training(lambda: [])
nlp.meta['name'] = 'en_ent_animal'
for itn in range(50):
losses = {}
for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3):
docs, golds = zip(*batch)
nlp.update(docs, golds, losses=losses, sgd=optimizer, update_shared=True,
drop=0.35)
print(losses)
if not output_dir:
return
elif not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
def main(model_name, output_directory=None):
print("Loading initial model", model_name)
nlp = spacy.load(model_name)
print("Creating initial model", model_name)
nlp = spacy.blank(model_name)
if output_directory is not None:
output_directory = Path(output_directory)
@ -91,6 +77,11 @@ def main(model_name, output_directory=None):
"Horses are too tall and they pretend to care about your feelings",
[(0, 6, 'ANIMAL')],
),
(
"Do they bite?",
[],
),
(
"horses are too tall and they pretend to care about your feelings",
[(0, 6, 'ANIMAL')]
@ -109,18 +100,20 @@ def main(model_name, output_directory=None):
)
]
nlp.entity.add_label('ANIMAL')
nlp.pipeline.append(TokenVectorEncoder(nlp.vocab))
nlp.pipeline.append(NeuralEntityRecognizer(nlp.vocab))
nlp.pipeline[-1].add_label('ANIMAL')
train_ner(nlp, train_data, output_directory)
# Test that the entity is recognized
doc = nlp('Do you like horses?')
text = 'Do you like horses?'
print("Ents in 'Do you like horses?':")
doc = nlp(text)
for ent in doc.ents:
print(ent.label_, ent.text)
if output_directory:
print("Loading from", output_directory)
nlp2 = spacy.load('en', path=output_directory)
nlp2.entity.add_label('ANIMAL')
nlp2 = spacy.load(output_directory)
doc2 = nlp2('Do you like horses?')
for ent in doc2.ents:
print(ent.label_, ent.text)


@ -1,3 +1,7 @@
'''Train a multi-label convolutional neural network text classifier,
using the spacy.pipeline.TextCategorizer component. The model is then added
to spacy.pipeline, and predictions are available at `doc.cats`.
'''
from __future__ import unicode_literals
import plac
import random
@ -12,6 +16,11 @@ from spacy.gold import GoldParse, minibatch
from spacy.util import compounding
from spacy.pipeline import TextCategorizer
# TODO: Remove this once we're not supporting models trained with thinc <6.9.0
import thinc.neural._classes.layernorm
thinc.neural._classes.layernorm.set_compat_six_eight(False)
def train_textcat(tokenizer, textcat,
train_texts, train_cats, dev_texts, dev_cats,
@ -24,14 +33,15 @@ def train_textcat(tokenizer, textcat,
train_docs = [tokenizer(text) for text in train_texts]
train_gold = [GoldParse(doc, cats=cats) for doc, cats in
zip(train_docs, train_cats)]
train_data = zip(train_docs, train_gold)
train_data = list(zip(train_docs, train_gold))
batch_sizes = compounding(4., 128., 1.001)
for i in range(n_iter):
losses = {}
train_data = tqdm.tqdm(train_data, leave=False) # Progress bar
for batch in minibatch(train_data, size=batch_sizes):
# Progress bar and minibatching
batches = minibatch(tqdm.tqdm(train_data, leave=False), size=batch_sizes)
for batch in batches:
docs, golds = zip(*batch)
textcat.update((docs, None), golds, sgd=optimizer, drop=0.2,
textcat.update(docs, golds, sgd=optimizer, drop=0.2,
losses=losses)
with textcat.model.use_params(optimizer.averages):
scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
@ -61,12 +71,13 @@ def evaluate(tokenizer, textcat, texts, cats):
return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
def load_data():
def load_data(limit=0):
# Partition off part of the train data --- avoid running experiments
# against test.
train_data, _ = thinc.extra.datasets.imdb()
random.shuffle(train_data)
train_data = train_data[-limit:]
texts, labels = zip(*train_data)
cats = [(['POSITIVE'] if y else []) for y in labels]
@ -86,7 +97,7 @@ def main(model_loc=None):
textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
print("Load IMDB data")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=1000)
print("Itn.\tLoss\tP\tR\tF")
progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
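The batch sizes fed to minibatch() above come from spacy.util.compounding, an infinite generator that starts at 4, multiplies by 1.001 each step and is capped at 128, so batches grow slowly over training. A quick illustration of that behaviour:

from spacy.util import compounding

sizes = compounding(4., 128., 1.001)
print([next(sizes) for _ in range(3)])
# roughly [4.0, 4.004, 4.008] -- creeping up towards the 128 ceiling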


@ -0,0 +1,30 @@
'''Load vectors for a language trained using FastText
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
'''
from __future__ import unicode_literals
import plac
import numpy
import spacy.language
def main(vectors_loc):
nlp = spacy.language.Language()
with open(vectors_loc, 'rb') as file_:
header = file_.readline()
nr_row, nr_dim = header.split()
nlp.vocab.clear_vectors(int(nr_dim))
for line in file_:
line = line.decode('utf8')
pieces = line.split()
word = pieces[0]
vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
nlp.vocab.set_vector(word, vector)
doc = nlp(u'class colspan')
print(doc[0].similarity(doc[1]))
if __name__ == '__main__':
plac.call(main)
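The script is driven by plac, so its only argument is the location of a .vec file in fastText's plain-text format: a header line with the row and dimension counts, then one word followed by its vector per line, exactly as parsed above. Assuming the script is saved as vectors_fast_text.py, a run looks like this (the path is a placeholder):

python vectors_fast_text.py /path/to/fasttext-vectors.vec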

fabfile.py

@ -14,6 +14,7 @@ VENV_DIR = path.join(PWD, ENV)
def env(lang='python2.7'):
if path.exists(VENV_DIR):
local('rm -rf {env}'.format(env=VENV_DIR))
local('pip install virtualenv')
local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
@ -32,6 +33,10 @@ def make():
local('pip install -r requirements.txt')
local('python setup.py build_ext --inplace')
def sdist():
with virtualenv(VENV_DIR):
with lcd(path.dirname(__file__)):
local('python setup.py sdist')
def clean():
with lcd(path.dirname(__file__)):
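The new sdist task pairs with the .buildkite/sdist.yml step added earlier in this commit, which chains it with the existing Fabric tasks before building the source distribution:

fab env clean make test sdist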


@ -1,9 +1,9 @@
cython<0.24
cython>=0.24,<0.27.0
pathlib
numpy>=1.7
cymem>=1.30,<1.32
preshed>=1.0.0,<2.0.0
thinc>=6.8.0,<6.9.0
thinc>=6.9.0,<6.10.0
murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6
six
@ -13,7 +13,7 @@ requests>=2.13.0,<3.0.0
regex==2017.4.5
ftfy>=4.4.2,<5.0.0
pytest>=3.0.6,<4.0.0
pip>=9.0.0,<10.0.0
mock>=2.0.0,<3.0.0
msgpack-python
msgpack-numpy
html5lib==1.0b8


@ -195,9 +195,8 @@ def setup_package():
'murmurhash>=0.28,<0.29',
'cymem>=1.30,<1.32',
'preshed>=1.0.0,<2.0.0',
'thinc>=6.8.0,<6.9.0',
'thinc>=6.9.0,<6.10.0',
'plac<1.0.0,>=0.9.6',
'pip>=9.0.0,<10.0.0',
'six',
'pathlib',
'ujson>=1.35',


@ -4,11 +4,13 @@ from __future__ import unicode_literals
from .cli.info import info as cli_info
from .glossary import explain
from .deprecated import resolve_load_name
#from .about import __version__
from .about import __version__
from . import util
def load(name, **overrides):
from .deprecated import resolve_load_name
name = resolve_load_name(name, **overrides)
return util.load_model(name, **overrides)


@ -7,7 +7,7 @@ if __name__ == '__main__':
import plac
import sys
from spacy.cli import download, link, info, package, train, convert, model
from spacy.cli import profile
from spacy.cli import profile, evaluate
from spacy.util import prints
commands = {
@ -15,6 +15,7 @@ if __name__ == '__main__':
'link': link,
'info': info,
'train': train,
'evaluate': evaluate,
'convert': convert,
'package': package,
'model': model,


@ -1,28 +1,27 @@
import ujson
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
from thinc.i2v import HashEmbed, StaticVectors
from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
from thinc.misc import Residual
from thinc.misc import BatchNorm as BN
from thinc.misc import LayerNorm as LN
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed
from thinc.api import FeatureExtracter, with_getitem
from thinc.api import uniqued, wrap, flatten_add_lengths, noop
from thinc.linear.linear import LinearModel
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
import random
import cytoolz
from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.static_vectors import StaticVectors
from thinc.neural._classes.batchnorm import BatchNorm as BN
from thinc.neural._classes.layernorm import LayerNorm as LN
from thinc.neural._classes.resnet import Residual
from thinc.neural import ReLu
from thinc.neural._classes.selu import SELU
from thinc import describe
from thinc.describe import Dimension, Synapses, Biases, Gradient
from thinc.neural._classes.affine import _set_dimensions_if_needed
from thinc.api import FeatureExtracter, with_getitem
from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
from thinc.neural._classes.attention import ParametricAttention
from thinc.linear.linear import LinearModel
from thinc.api import uniqued, wrap, flatten_add_lengths
import thinc.extra.load_nlp
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
from .tokens.doc import Doc
@ -31,6 +30,11 @@ from . import util
import numpy
import io
# TODO: Unset this once we no longer want to support previously trained models.
import thinc.neural._classes.layernorm
thinc.neural._classes.layernorm.set_compat_six_eight(True)
VECTORS_KEY = 'spacy_pretrained_vectors'
@layerize
def _flatten_add_lengths(seqs, pad=0, drop=0.):
@ -225,33 +229,80 @@ def drop_layer(layer, factor=2.):
model.predict = layer
return model
def link_vectors_to_models(vocab):
vectors = vocab.vectors
ops = Model.ops
for word in vocab:
if word.orth in vectors.key2row:
word.rank = vectors.key2row[word.orth]
else:
word.rank = 0
data = ops.asarray(vectors.data)
# Set an entry here, so that vectors are accessed by StaticVectors
# (unideal, I know)
thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
def Tok2Vec(width, embed_size, preprocess=None):
def Tok2Vec(width, embed_size, **kwargs):
pretrained_dims = kwargs.get('pretrained_dims', 0)
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
'*': reapply}):
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
if pretrained_dims is not None and pretrained_dims >= 1:
glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
tok2vec = (
with_flatten(
asarray(Model.ops, dtype='uint64')
>> uniqued(embed, column=5)
>> Residual(
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
) ** 4, pad=4
)
embed = uniqued(
(glove | norm | prefix | suffix | shape)
>> LN(Maxout(width, width*5, pieces=3)), column=5)
else:
embed = uniqued(
(norm | prefix | suffix | shape)
>> LN(Maxout(width, width*4, pieces=3)), column=5)
convolution = Residual(
ExtractWindow(nW=1)
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
)
if preprocess not in (False, None):
tok2vec = preprocess >> tok2vec
tok2vec = (
FeatureExtracter(cols)
>> with_flatten(
embed >> (convolution ** 4), pad=4)
)
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
tok2vec.nO = width
tok2vec.embed = embed
return tok2vec
def reapply(layer, n_times):
def reapply_fwd(X, drop=0.):
backprops = []
for i in range(n_times):
Y, backprop = layer.begin_update(X, drop=drop)
X = Y
backprops.append(backprop)
def reapply_bwd(dY, sgd=None):
dX = None
for backprop in reversed(backprops):
dY = backprop(dY, sgd=sgd)
if dX is None:
dX = dY
else:
dX += dY
return dX
return Y, reapply_bwd
return wrap(reapply_fwd, layer)
def asarray(ops, dtype):
def forward(X, drop=0.):
return ops.asarray(X, dtype=dtype), None
@ -455,20 +506,25 @@ def getitem(i):
return X[i], None
return layerize(getitem_fwd)
def build_tagger_model(nr_class, token_vector_width, **cfg):
embed_size = util.env_opt('embed_size', 7500)
def build_tagger_model(nr_class, **cfg):
embed_size = util.env_opt('embed_size', 7000)
if 'token_vector_width' in cfg:
token_vector_width = cfg['token_vector_width']
else:
token_vector_width = util.env_opt('token_vector_width', 128)
pretrained_dims = cfg.get('pretrained_dims', 0)
with Model.define_operators({'>>': chain, '+': add}):
# Input: (doc, tensor) tuples
private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())
if 'tok2vec' in cfg:
tok2vec = cfg['tok2vec']
else:
tok2vec = Tok2Vec(token_vector_width, embed_size,
pretrained_dims=pretrained_dims)
model = (
fine_tune(private_tok2vec)
>> with_flatten(
Maxout(token_vector_width, token_vector_width)
>> Softmax(nr_class, token_vector_width)
)
tok2vec
>> with_flatten(Softmax(nr_class, token_vector_width))
)
model.nI = None
model.tok2vec = tok2vec
return model
@ -514,6 +570,7 @@ def foreach(layer, drop_factor=1.0):
def build_text_classifier(nr_class, width=64, **cfg):
nr_vector = cfg.get('nr_vector', 5000)
pretrained_dims = cfg.get('pretrained_dims', 0)
with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
'**': clone}):
if cfg.get('low_data'):
@ -521,7 +578,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
SpacyVectors
>> flatten_add_lengths
>> with_getitem(0,
Affine(width, 300)
Affine(width, pretrained_dims)
)
>> ParametricAttention(width)
>> Pooling(sum_pool)
@ -548,18 +605,24 @@ def build_text_classifier(nr_class, width=64, **cfg):
)
)
static_vectors = (
SpacyVectors
>> with_flatten(Affine(width, 300))
)
cnn_model = (
if pretrained_dims:
static_vectors = (
SpacyVectors
>> with_flatten(Affine(width, pretrained_dims))
)
# TODO Make concatenate support lists
concatenate_lists(trained_vectors, static_vectors)
vectors = concatenate_lists(trained_vectors, static_vectors)
vectors_width = width*2
else:
vectors = trained_vectors
vectors_width = width
static_vectors = None
cnn_model = (
vectors
>> with_flatten(
LN(Maxout(width, width*2))
LN(Maxout(width, vectors_width))
>> Residual(
(ExtractWindow(nW=1) >> zero_init(Maxout(width, width*3)))
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
) ** 2, pad=2
)
>> flatten_add_lengths
@ -579,7 +642,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
>> logistic
)
model.nO = nr_class
model.lsuv = False
return model
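Tok2Vec and build_tagger_model now take their configuration through keyword arguments instead of a preprocess callback, and mix in pretrained vectors when pretrained_dims is set. A construction sketch under those assumptions (the widths are illustrative, matching the env_opt defaults read above; this is not a drop-in recipe):

from spacy._ml import Tok2Vec, build_tagger_model

# 128-dim token vectors from a 7000-row hashed embedding table, with a
# 300-dim pretrained-vector branch routed through StaticVectors.
tok2vec = Tok2Vec(128, 7000, pretrained_dims=300)
tagger = build_tagger_model(nr_class=50, token_vector_width=128,
                            pretrained_dims=300, tok2vec=tok2vec)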


@ -3,14 +3,15 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy-nightly'
__version__ = '2.0.0a13'
__version__ = '2.0.0a16'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'
__email__ = 'contact@explosion.ai'
__license__ = 'MIT'
__release__ = True
__docs_models__ = 'https://spacy.io/docs/usage/models'
__docs_models__ = 'https://alpha.spacy.io/usage/models'
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'


@ -1,5 +1,5 @@
# Reserve 64 values for flag features
cpdef enum attr_id_t:
cdef enum attr_id_t:
NULL_ATTR
IS_ALPHA
IS_ASCII


@ -94,6 +94,7 @@ IDS = {
# ATTR IDs, in order of the symbol
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
locals().update(IDS)
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):


@ -4,5 +4,6 @@ from .link import link
from .package import package
from .profile import profile
from .train import train
from .evaluate import evaluate
from .convert import convert
from .model import model


@ -14,7 +14,7 @@ from ..util import prints
CONVERTERS = {
'.conllu': conllu2json,
'.conll': conllu2json,
'.iob': iob2json
'.iob': iob2json,
}


@ -1,5 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
from cytoolz import partition_all, concat
from ...compat import json_dumps, path2str
from ...util import prints
@ -10,11 +11,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
"""
Convert IOB files into JSON format for use with train cli.
"""
# TODO: This isn't complete yet -- need to map from IOB to
# BILUO
with input_path.open('r', encoding='utf8') as file_:
docs = read_iob(file_)
sentences = read_iob(file_)
docs = merge_sentences(sentences, n_sents)
output_filename = input_path.parts[-1].replace(".iob", ".json")
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
@ -23,9 +22,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
title="Generated output file %s" % path2str(output_file))
def read_iob(file_):
def read_iob(raw_sents):
sentences = []
for line in file_:
for line in raw_sents:
if not line.strip():
continue
tokens = [t.split('|') for t in line.split()]
@ -43,3 +42,15 @@ def read_iob(file_):
paragraphs = [{'sentences': [sent]} for sent in sentences]
docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
return docs
def merge_sentences(docs, n_sents):
counter = 0
merged = []
for group in partition_all(n_sents, docs):
group = list(group)
first = group.pop(0)
to_extend = first['paragraphs'][0]['sentences']
for sent in group[1:]:
to_extend.extend(sent['paragraphs'][0]['sentences'])
merged.append(first)
return merged
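merge_sentences groups the one-sentence documents produced by read_iob into blocks of n_sents using cytoolz.partition_all, which yields fixed-size chunks plus a shorter final chunk. For example:

from cytoolz import partition_all

# Five single-sentence docs merged in groups of two -> chunks of 2, 2 and 1.
print(list(partition_all(2, ['s1', 's2', 's3', 's4', 's5'])))
# [('s1', 's2'), ('s3', 's4'), ('s5',)]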

spacy/cli/evaluate.py (new file)

@ -0,0 +1,119 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import plac
import json
from collections import defaultdict
import cytoolz
from pathlib import Path
import dill
import tqdm
from thinc.neural._classes.model import Model
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer
import random
import numpy.random
from ..tokens.doc import Doc
from ..scorer import Scorer
from ..gold import GoldParse, merge_sents
from ..gold import GoldCorpus, minibatch
from ..util import prints
from .. import util
from .. import about
from .. import displacy
from ..compat import json_dumps
random.seed(0)
numpy.random.seed(0)
@plac.annotations(
model=("Model name or path", "positional", None, str),
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
gpu_id=("Use GPU", "option", "g", int),
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)
)
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
displacy_path=None, displacy_limit=25):
"""
Evaluate a model. To render a sample of parses in an HTML file, set an output
directory as the displacy_path argument.
"""
util.use_gpu(gpu_id)
util.set_env_log(False)
data_path = util.ensure_path(data_path)
displacy_path = util.ensure_path(displacy_path)
if not data_path.exists():
prints(data_path, title="Evaluation data not found", exits=1)
if displacy_path and not displacy_path.exists():
prints(displacy_path, title="Visualization output directory not found", exits=1)
corpus = GoldCorpus(data_path, data_path)
nlp = util.load_model(model)
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
begin = timer()
scorer = nlp.evaluate(dev_docs, verbose=False)
end = timer()
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
print_results(scorer, time=end - begin, words=nwords,
wps=nwords / (end - begin))
if displacy_path:
docs, golds = zip(*dev_docs)
render_deps = 'parser' in nlp.meta.get('pipeline', [])
render_ents = 'ner' in nlp.meta.get('pipeline', [])
render_parses(docs, displacy_path, model_name=model, limit=displacy_limit,
deps=render_deps, ents=render_ents)
prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit)
def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True):
docs[0].user_data['title'] = model_name
if ents:
with (output_path / 'entities.html').open('w') as file_:
html = displacy.render(docs[:limit], style='ent', page=True)
file_.write(html)
if deps:
with (output_path / 'parses.html').open('w') as file_:
html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True})
file_.write(html)
def print_progress(itn, losses, dev_scores, wps=0.0):
scores = {}
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
'ents_p', 'ents_r', 'ents_f', 'wps']:
scores[col] = 0.0
scores['dep_loss'] = losses.get('parser', 0.0)
scores['ner_loss'] = losses.get('ner', 0.0)
scores['tag_loss'] = losses.get('tagger', 0.0)
scores.update(dev_scores)
scores['wps'] = wps
tpl = '\t'.join((
'{:d}',
'{dep_loss:.3f}',
'{ner_loss:.3f}',
'{uas:.3f}',
'{ents_p:.3f}',
'{ents_r:.3f}',
'{ents_f:.3f}',
'{tags_acc:.3f}',
'{token_acc:.3f}',
'{wps:.1f}'))
print(tpl.format(itn, **scores))
def print_results(scorer, time, words, wps):
results = {
'Time': '%.2f s' % time,
'Words': words,
'Words/s': '%.0f' % wps,
'TOK': '%.2f' % scorer.token_acc,
'POS': '%.2f' % scorer.tags_acc,
'UAS': '%.2f' % scorer.uas,
'LAS': '%.2f' % scorer.las,
'NER P': '%.2f' % scorer.ents_p,
'NER R': '%.2f' % scorer.ents_r,
'NER F': '%.2f' % scorer.ents_f}
util.print_table(results, title="Results")
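The command is registered with python -m spacy in the __main__.py hunk earlier, so it runs against a trained model and JSON-formatted gold data; passing a displacy output directory additionally writes a sample of rendered parses as HTML. A minimal invocation (model and path are placeholders):

python -m spacy evaluate /path/to/model /path/to/dev.json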


@ -105,8 +105,11 @@ def generate_pipeline():
"parser, ner. For more information, see the docs on processing pipelines.",
title="Enter your model's pipeline components")
pipeline = util.get_raw_input("Pipeline components", True)
replace = {'True': True, 'False': False}
return replace[pipeline] if pipeline in replace else pipeline.split(', ')
subs = {'True': True, 'False': False}
if pipeline in subs:
return subs[pipeline]
else:
return [p.strip() for p in pipeline.split(',')]
def validate_meta(meta, keys):


@ -8,8 +8,11 @@ import cytoolz
from pathlib import Path
import dill
import tqdm
from thinc.neural._classes.model import Model
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer
import random
import numpy.random
from ..tokens.doc import Doc
from ..scorer import Scorer
@ -17,9 +20,13 @@ from ..gold import GoldParse, merge_sents
from ..gold import GoldCorpus, minibatch
from ..util import prints
from .. import util
from .. import about
from .. import displacy
from ..compat import json_dumps
random.seed(0)
numpy.random.seed(0)
@plac.annotations(
lang=("model language", "positional", None, str),
@ -29,15 +36,17 @@ from ..compat import json_dumps
n_iter=("number of iterations", "option", "n", int),
n_sents=("number of sentences", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int),
resume=("Whether to resume training", "flag", "R", bool),
vectors=("Model to load vectors from", "option", "v"),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
version=("Model version", "option", "V", str),
meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
)
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
gold_preproc=False):
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
gold_preproc=False, version="0.0.0", meta_path=None):
"""
Train a model. Expects data in spaCy's JSON format.
"""
@ -46,19 +55,24 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
output_path = util.ensure_path(output_dir)
train_path = util.ensure_path(train_data)
dev_path = util.ensure_path(dev_data)
meta_path = util.ensure_path(meta_path)
if not output_path.exists():
output_path.mkdir()
if not train_path.exists():
prints(train_path, title="Training data not found", exits=1)
if dev_path and not dev_path.exists():
prints(dev_path, title="Development data not found", exits=1)
if meta_path is not None and not meta_path.exists():
prints(meta_path, title="meta.json not found", exits=1)
meta = util.read_json(meta_path) if meta_path else {}
if not isinstance(meta, dict):
prints("Expected dict but got: {}".format(type(meta)),
title="Not a valid meta.json format", exits=1)
lang_class = util.get_lang_class(lang)
pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies')
if no_entities and 'entities' in pipeline: pipeline.remove('entities')
pipeline = ['tagger', 'parser', 'ner']
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
if no_parser and 'parser' in pipeline: pipeline.remove('parser')
if no_entities and 'ner' in pipeline: pipeline.remove('ner')
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
@ -68,33 +82,30 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
util.env_opt('dropout_to', 0.2),
util.env_opt('dropout_decay', 0.0))
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
util.env_opt('batch_to', 64),
util.env_opt('batch_to', 16),
util.env_opt('batch_compound', 1.001))
if resume:
prints(output_path / 'model9.pickle', title="Resuming training")
nlp = dill.load((output_path / 'model9.pickle').open('rb'))
else:
nlp = lang_class(pipeline=pipeline)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()
lang_class = util.get_lang_class(lang)
nlp = lang_class(pipeline=pipeline)
if vectors:
util.load_model(vectors, vocab=nlp.vocab)
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
nlp._optimizer = None
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
try:
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
gold_preproc=gold_preproc, max_length=0)
train_docs = list(train_docs)
for i in range(n_iter):
if resume:
i += 20
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
gold_preproc=gold_preproc, max_length=0)
losses = {}
for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses,
update_shared=True)
drop=next(dropout_rates), losses=losses)
pbar.update(sum(len(doc) for doc in docs))
with nlp.use_params(optimizer.averages):
@ -104,12 +115,22 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
scorer = nlp_loaded.evaluate(
corpus.dev_docs(
list(corpus.dev_docs(
nlp_loaded,
gold_preproc=gold_preproc))
gold_preproc=gold_preproc)))
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_:
file_.write(json_dumps(scorer.scores))
meta_loc = output_path / ('model%d' % i) / 'meta.json'
meta['accuracy'] = scorer.scores
meta['lang'] = nlp.lang
meta['pipeline'] = pipeline
meta['spacy_version'] = '>=%s' % about.__version__
meta.setdefault('name', 'model%d' % i)
meta.setdefault('version', version)
with meta_loc.open('w') as file_:
file_.write(json_dumps(meta))
util.set_env_log(True)
print_progress(i, losses, scorer.scores)
finally:
@ -138,12 +159,14 @@ def print_progress(itn, losses, dev_scores, wps=0.0):
'ents_p', 'ents_r', 'ents_f', 'wps']:
scores[col] = 0.0
scores['dep_loss'] = losses.get('parser', 0.0)
scores['ner_loss'] = losses.get('ner', 0.0)
scores['tag_loss'] = losses.get('tagger', 0.0)
scores.update(dev_scores)
scores['wps'] = wps
tpl = '\t'.join((
'{:d}',
'{dep_loss:.3f}',
'{ner_loss:.3f}',
'{uas:.3f}',
'{ents_p:.3f}',
'{ents_r:.3f}',


@ -7,6 +7,7 @@ import re
import ujson
import random
import cytoolz
import itertools
from .syntax import nonproj
from .util import ensure_path
@ -146,9 +147,13 @@ def minibatch(items, size=8):
'''Iterate over batches of items. `size` may be an iterator,
so that batch-size can vary on each step.
'''
if isinstance(size, int):
size_ = itertools.repeat(8)
else:
size_ = size
items = iter(items)
while True:
batch_size = next(size) #if hasattr(size, '__next__') else size
batch_size = next(size_)
batch = list(cytoolz.take(int(batch_size), items))
if len(batch) == 0:
break


@ -29,9 +29,9 @@ _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm
'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
'TB T G M K %')
_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
_punct = r', : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
_punct = r'…… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & ·'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'
_hyphens = '- — -- ---'
_hyphens = '- — -- --- —— ~'
_other_symbols = r'[\p{So}]'
UNITS = merge_chars(_units)


@ -3,6 +3,7 @@ from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
@ -23,6 +24,7 @@ class GermanDefaults(Language.Defaults):
NORM_EXCEPTIONS, BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = tuple(TOKENIZER_INFIXES)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
syntax_iterators = dict(SYNTAX_ITERATORS)


@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
_quotes = QUOTES.replace("'", '')
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
r'(?<=[0-9])-(?=[0-9])'])
TOKENIZER_INFIXES = _infixes


@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
@ -17,6 +18,7 @@ from ...util import update_exc, add_lookups
class FrenchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: 'fr'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)


@ -0,0 +1,41 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = set("""
zero un deux trois quatre cinq six sept huit neuf dix
onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
vingt trente quanrante cinquante soixante septante quatre-vingt huitante nonante
cent mille mil million milliard billion quadrillion quintillion
sextillion septillion octillion nonillion decillion
""".split())
_ordinal_words = set("""
premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième
onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neufième
vingtième trentième quanrantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième
centième millième millionnième milliardième billionnième quadrillionnième quintillionnième
sextillionnième septillionnième octillionnième nonillionnième decillionnième
""".split())
def like_num(text):
# Might require more work?
# See this discussion: https://github.com/explosion/spaCy/pull/1161
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True
if text.count('/') == 1:
num, denom = text.split('/')
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
return True
return False
LEX_ATTRS = {
LIKE_NUM: like_num
}
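A small sanity check of the helper above; the import path is an assumption based on the surrounding files. Note that _ordinal_words is defined but not consulted by like_num as written.

from spacy.lang.fr.lex_attrs import like_num

assert like_num('vingt')           # number word
assert like_num('1,000')           # separators are stripped before the digit check
assert like_num('3/4')             # simple fractions
assert not like_num('deuxième')    # ordinals fall through, since _ordinal_words is unused here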

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@ -12,6 +13,7 @@ from ...util import update_exc, add_lookups
class DutchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: 'nl'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

View File

@ -0,0 +1,40 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = set("""
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd
duizend miljoen miljard biljoen biljard triljoen triljard
""".split())
_ordinal_words = set("""
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste
zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste
miljardste biljoenste biljardste triljoenste triljardste
""".split())
def like_num(text):
# This only does the most basic check for whether a token is a digit
# or matches one of the number words. In order to handle numbers like
# "drieëntwintig", more work is required.
# See this discussion: https://github.com/explosion/spaCy/pull/1177
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True
if text.count('/') == 1:
num, denom = text.split('/')
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
return True
return False
LEX_ATTRS = {
LIKE_NUM: like_num
}

35
spacy/lang/th/__init__.py Normal file
View File

@ -0,0 +1,35 @@
# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...tokens import Doc
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class ThaiDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'th'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
class Thai(Language):
lang = 'th'
Defaults = ThaiDefaults
def make_doc(self, text):
try:
from pythainlp.tokenize import word_tokenize
except ImportError:
raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
"https://github.com/wannaphongcom/pythainlp/")
words = list(word_tokenize(text, "newmm"))
return Doc(self.vocab, words=words, spaces=[False]*len(words))
__all__ = ['Thai']
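A hedged usage sketch for the new Thai class: it requires pythainlp at call time, and the example sentence and output are illustrative only.

from spacy.lang.th import Thai

nlp = Thai()
doc = nlp.make_doc('ผมกินข้าว')          # segmented with pythainlp's "newmm" tokenizer
print([t.text for t in doc])             # e.g. ['ผม', 'กิน', 'ข้าว']
print([t.whitespace_ for t in doc])      # all empty, since spaces=[False]*len(words)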

View File

@ -0,0 +1,62 @@
# encoding: utf8
from __future__ import unicode_literals
# data from https://github.com/wannaphongcom/pythainlp/blob/dev/pythainlp/corpus/stopwords-th.txt
# stop words as whitespace-separated list
STOP_WORDS = set("""
นอกจาก าให ทาง งน วง จาก จะ ความ คร คง ของ
ขอ ระหวาง รวม มาก มา พรอม พบ าน ผล บาง เปดเผย เป เนองจาก เดยวก เดยว เช เฉพาะ เข
อง างๆ าง ตาม งแต าน วย อาจ ออก อยาง อะไร อย อยาก หาก หลาย หลงจาก แต เอง เห
เลย เร เรา เม เพ เพราะ เปนการ เป หล หร หน วน าหร ลง วม ราย ขณะ อน การ
กว กลาว ไว ไป ได ให ใน โดย แห แล และ แรก แบบ เขา เคย ไม อยาก เก เกนๆ เกยวก เกยวก
เกยวของ เกยวเนอง เกยวๆ เกอบ เกอบจะ เกอบๆ แก แก แกไข ใกล ใกล ไกล ไกลๆ ขณะเดยวก ขณะใด ขณะใดๆ ขณะท ขณะน ขณะน ขณะหน ขวาง
ขวางๆ ใคร ใคร ใครจะ ใครๆ าย ายๆ ไง จง จด จน จนกระท จนกว จนขณะน จนตลอด จนถ จนท จนบดน จนเม จนแม จนแม
จรด จรดก จร จรงจ จรงๆ จรงๆจงๆ จวน จวนจะ จวนเจยน จวบ งก งก งก งกนและก งไดแก งๆ วย วยก วยเชนก วยท วยประการฉะน
วยเพราะ วยว วยเหต วยเหต วยเหต วยเหตเพราะ วยเหต วยเหมอนก งกลาว งก งก งกบว งกบว งเก
งเก งเคย ใดๆ ได ไดแก ไดแต ได ไดมา ได ตน ตนเอง ตนฯ ตรง ตรงๆ ตลอด ตลอดกาล ตลอดกาลนาน ตลอดจน ตลอดถ ตลอดท
ตลอดท ตลอดทวถ ตลอดทวท ตลอดป ตลอดไป ตลอดมา ตลอดระยะเวลา ตลอดว ตลอดเวลา ตลอดศก อก งแก งจะ งบดน งบดน
งเม งเมอใด งเมอไร งแม งแมจะ งแม งอยางไร อว กตอง กๆ เถอะ เถ ทรง ทว งคน งต งท งท งน งนนดวย งนนเพราะ
นอก นอกจากท นอกจากน นอกจากน นอกจากว นอกน นอกเหน นอกเหนอจาก อย อยกว อยๆ นะ กๆ นไง นเป นแหละ
นเอง นๆ บจากน บจากน บตงแต บแต บแต บแต เปนต เปนตนไป เปนตนมา เปนแต เปนแตเพยง เปนท เปนท เปนท เปนเพราะ
เปนเพราะว เปนเพยง เปนเพยงว เปนเพ เปนอ เปนอนมาก เปนอนว เปนอนๆ เปนอาท เปนๆ เปลยน เปลยนแปลง เป เปดเผย ไป าน านๆ
ดๆ เพยงเพ เพยงไร เพยงไหน เพอท เพอทจะ เพอว เพอให ภาค ภาคฯ ภาย ภายใต ภายนอก ภายใน ภายภาค ภายภาคหน ภายหน ภายหล
มอง มองว กจะ นๆ ยนะ ยน ยเน ยล นนาน นยง นย นยาว เยอะ เยอะแยะ เยอะๆ แยะ แยะๆ รวด รวดเร วม รวมก วมก
รวมดวย วมดวย รวมถ รวมท วมม รวมๆ ระยะ ระยะๆ ระหวาง บรอง อว นกาลนาน บเนอง ดๆ งกว งส งส งๆ เสมอนก
เสมอนว เสร เสรจก เสรจแล เสรจสมบรณ เสรจส เส เสยกอน เสยจน เสยจนกระท เสยจนถ เสยดวย เสยน เสยนนเอง เสยน เสยนกระไร เสยย
เสยยงน เสยแล ใหญ ให ใหแด ใหไป ใหม ใหมา ใหม ไหน ไหนๆ อด อน อยาง อยางเช อยางด อยางเดยว อยางใด อยางท อยางนอย อยางน
อยางน อยางโน แค จะ ได อเม ตาม ตามแต ตามท แลวแต กระท กระทำ กระน กระผม กล กลาวค กล กลมกอน
กลมๆ กวาง กวางขวาง กวางๆ อนหน อนหนาน อนๆ นดกว นดไหม นเถอะ นนะ นและก นไหม นเอง กำล กำลงจะ กำหนด เก
เก เกยวของ แก แกไข ใกล ใกล าง างเคยง างต างบน างลาง างๆ ขาด าพเจ าฯ เขาใจ เขยน คงจะ คงอย ครบ ครบคร ครบถวน
ครงกระน ครงกอน ครงครา ครงคราว ครงใด ครงท ครงน ครงน ครงละ ครงหน ครงหล ครงหลงส ครงไหน ครงๆ คร คร ครา คราใด คราท คราน คราน คราหน
คราไหน คราว คราวกอน คราวใด คราวท คราวน คราวน คราวโน คราวละ คราวหน คราวหน คราวหล คราวไหน คราวๆ คลาย คลายก คลายกนก
คลายก คลายกบว คลายว ควร อน อนขาง อนขางจะ อยไปทาง อนมาทาง อย อยๆ คะ คำ ดว ณๆ
เคยๆ แค แคจะ แค แค แคเพยง แค แคไหน ใคร ใครจะ าย ายๆ จนกว จนแม จนแม งๆ จวบก จวบจน จะได ดการ ดงาน ดแจง
ดต ดทำ ดหา ดให จากน จากน จากนไป จำ จำเป จำพวก งจะ งเป ฉะน ฉะน เฉกเช เฉย เฉยๆ ไฉน วงกอน
วงตอไป วงถดไป วงทาย วงท วงน วงน วงระหวาง วงแรก วงหน วงหล วงๆ วย านาน ชาว าๆ เชนกอน เชนก เชนเคย
เชนด เชนดงกอน เชนดงเก เชนดงท เชนดงว เชนเดยวก เชนเดยวก เชนใด เชนท เชนทเคย เชนท เชนน เชนนนเอง เชนน เชนเม เชนไร เช
เชอถ เชอม เชอว ใช ใชไหม ใช ซะ ซะกอน ซะจน ซะจนกระท ซะจนถ งไดแก วยก วยเชนก วยท วยเพราะ วยว วยเหต วยเหต
วยเหต วยเหตเพราะ วยเหต วยเหมอนก งกลาว งกบว งกบว งเก งเก งเคย างก างหาก ตามดวย ตามแต ตามท
ตามๆ เตมไปดวย เตมไปหมด เตมๆ แต แตอน แตจะ แตเด แตอง แต แตทว แต แต แตเพยง แตเม แตไร แตละ แต แตไหน แตอยางใด โต
โตๆ ใต าจะ าหาก งแก งแม งแมจะ งแม งอยางไร อว กตอง ทว งนนดวย งปวง งเป งมวล งส งหมด งหลาย งๆ
นใดน นท นทนใด ทำไม ทำไร ทำให ทำๆ จร เดยว ใด ใด ได เถอะ แท แทจร ไร ละ ละ
แล แหงน ไหน กคน กคร กครา กคราว กช กต กทาง กท กท กเม กว กวนน กส กหน กแห กอยาง
กอ กๆ เท เทาก เทาก เทาใด เทาท เทาน เทาน เทาไร เทาไหร แท แทจร เธอ นอกจากว อย อยกว อยๆ นไว บแต นาง
นางสาว าจะ นาน นานๆ นาย นำ นำพา นำมา ดหนอย ดๆ ไง นา แน แหละ แหล เอง เอง เน เน
เนยเอง ในชวง ในท ในเม ในระหวาง บน บอก บอกแล บอกว อย อยกว อยคร อยๆ ดดล ดเดยวน ดน ดน าง บางกว
บางขณะ บางคร บางครา บางคราว บางท บางท บางแห บางๆ ปฏ ประกอบ ประการ ประการฉะน ประการใด ประการหน ประมาณ ประสบ ปร
ปรากฏ ปรากฏว จจ เปนดวย เปนด เปนต เปนแต เปนเพ เปนอ เปนอนมาก เปนอาท านๆ ใด เผ เผอจะ เผอท เผอว าย
ายใด พบว พยายาม พรอมก พรอมก พรอมดวย พรอมท พรอมท พรอมเพยง พวก พวกก พวกก พวกแก พวกเขา พวกค พวกฉ พวกทาน
พวกท พวกเธอ พวกน พวกน พวกน พวกโน พวกม พวกม พอ พอก พอควร พอจะ พอด พอต พอท พอท พอเพยง พอแล พอสม พอสมควร
พอเหมาะ พอๆ พา นๆ เพราะฉะน เพราะว เพ เพงจะ เพ เพมเต เพยง เพยงแค เพยงใด เพยงแต เพยงพอ เพยงเพราะ
เพอว เพอให ภายใต มองว มากกว มากมาย ฉะน ใช ได แต งเน งหมาย เมอกอน เมอคร เมอครงกอน
เมอคราวกอน เมอคราวท เมอคราว เมอค เมอเช เมอใด เมอน เมอน เมอเย เมอไร เมอวนวาน เมอวาน เมอไหร แม แมกระท แมแต แมนว แม
ไมอย ไมอยจะ ไมอยเป ไมใช ไมเปนไร ไม ยก ยกให ยอม ยอมร อม อย งคง งง งง งโง งไง งจะ งแต ยาก
ยาว ยาวนาน งกว งข งขนไป งจน งจะ งน งเม งแล งใหญ วมก รวมดวย วมดวย อว เร เรวๆ เราๆ เรยก เรยบ เรอย
เรอยๆ ไร วน วนจน วนแต ละ าส เล เลกนอย เลกๆ เลาว แลวก แลวแต แลวเสร นใด นน นน นไหน สบาย สม สมยกอน
สมยน สมยน สมยโน วนเก วนดอย วนด วนใด วนท วนนอย วนน วนมาก วนใหญ นๆ สามารถ สำค
งใด งน งน งไหน เสรจแล เสยดวย เสยแล แสดง แสดงว หน หนอ หนอย หนอย หมด หมดก หมดส หรอไง หรอเปล หรอไม หรอย
หรอไร หากแม หากแม หากแมนว หากว หาความ หาใช หาร เหต เหตผล เหต เหต เหตไร เหนแก เหนควร เหนจะ เหนว เหล เหลอเก เหล
เหลาน เหลาน แหงใด แหงน แหงน แหงโน แหงไหน แหละ ใหแก ใหญ ใหญโต อยางเช อยางด อยางเดยว อยางใด อยางท อยางนอย อยางน อยางน
อยางโน อยางมาก อยางย อยางไร อยางไรก อยางไรกได อยางไรเส อยางละ อยางหน อยางไหน อยางๆ นจะ นใด นไดแก นท
นทจร นทจะ นเนองมาจาก นละ นไหน นๆ อาจจะ อาจเป อาจเปนดวย นๆ เอ เอา ฯล ฯลฯ
""".split())

81
spacy/lang/th/tag_map.py Normal file
View File

@ -0,0 +1,81 @@
# encoding: utf8
# data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1)
from __future__ import unicode_literals
from ...symbols import *
TAG_MAP = {
#NOUN
"NOUN": {POS: NOUN},
"NCMN": {POS: NOUN},
"NTTL": {POS: NOUN},
"CNIT": {POS: NOUN},
"CLTV": {POS: NOUN},
"CMTR": {POS: NOUN},
"CFQC": {POS: NOUN},
"CVBL": {POS: NOUN},
#PRON
"PRON": {POS: PRON},
"NPRP": {POS: PRON},
# ADJ
"ADJ": {POS: ADJ},
"NONM": {POS: ADJ},
"VATT": {POS: ADJ},
"DONM": {POS: ADJ},
# ADV
"ADV": {POS: ADV},
"ADVN": {POS: ADV},
"ADVI": {POS: ADV},
"ADVP": {POS: ADV},
"ADVS": {POS: ADV},
# INTJ
"INT": {POS: INTJ},
# PROPN
"PROPN": {POS: PROPN},
"PPRS": {POS: PROPN},
"PDMN": {POS: PROPN},
"PNTR": {POS: PROPN},
# DET
"DET": {POS: DET},
"DDAN": {POS: DET},
"DDAC": {POS: DET},
"DDBQ": {POS: DET},
"DDAQ": {POS: DET},
"DIAC": {POS: DET},
"DIBQ": {POS: DET},
"DIAQ": {POS: DET},
"DCNM": {POS: DET},
# NUM
"NUM": {POS: NUM},
"NCNM": {POS: NUM},
"NLBL": {POS: NUM},
"DCNM": {POS: NUM},
# AUX
"AUX": {POS: AUX},
"XVBM": {POS: AUX},
"XVAM": {POS: AUX},
"XVMM": {POS: AUX},
"XVBB": {POS: AUX},
"XVAE": {POS: AUX},
# ADP
"ADP": {POS: ADP},
"RPRE": {POS: ADP},
# CCONJ
"CCONJ": {POS: CCONJ},
"JCRG": {POS: CCONJ},
# SCONJ
"SCONJ": {POS: SCONJ},
"PREL": {POS: SCONJ},
"JSBR": {POS: SCONJ},
"JCMP": {POS: SCONJ},
# PART
"PART": {POS: PART},
"FIXN": {POS: PART},
"FIXV": {POS: PART},
"EAFF": {POS: PART},
"AITT": {POS: PART},
"NEG": {POS: PART},
# PUNCT
"PUNCT": {POS: PUNCT},
"PUNC": {POS: PUNCT}
}

View File

@ -0,0 +1,43 @@
# encoding: utf8
from __future__ import unicode_literals
from ...symbols import *
TOKENIZER_EXCEPTIONS = {
"ม.ค.": [
{ORTH: "ม.ค.", LEMMA: "มกราคม"}
],
"ก.พ.": [
{ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"}
],
"มี.ค.": [
{ORTH: "มี.ค.", LEMMA: "มีนาคม"}
],
"เม.ย.": [
{ORTH: "เม.ย.", LEMMA: "เมษายน"}
],
"พ.ค.": [
{ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}
],
"มิ.ย.": [
{ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}
],
"ก.ค.": [
{ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}
],
"ส.ค.": [
{ORTH: "ส.ค.", LEMMA: "สิงหาคม"}
],
"ก.ย.": [
{ORTH: "ก.ย.", LEMMA: "กันยายน"}
],
"ต.ค.": [
{ORTH: "ต.ค.", LEMMA: "ตุลาคม"}
],
"พ.ย.": [
{ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}
],
"ธ.ค.": [
{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}
]
}
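Each entry maps a Thai month abbreviation to a single-token analysis whose lemma is the full month name. A minimal check, with the import path assumed from the surrounding files:

from spacy.symbols import ORTH, LEMMA
from spacy.lang.th.tokenizer_exceptions import TOKENIZER_EXCEPTIONS

assert TOKENIZER_EXCEPTIONS['ม.ค.'] == [{ORTH: 'ม.ค.', LEMMA: 'มกราคม'}]   # January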

View File

@ -14,8 +14,8 @@ class Chinese(Language):
except ImportError:
raise ImportError("The Chinese tokenizer requires the Jieba library: "
"https://github.com/fxsjy/jieba")
words = list(jieba.cut(text, cut_all=True))
words=[x for x in words if x]
words = list(jieba.cut(text, cut_all=False))
words = [x for x in words if x]
return Doc(self.vocab, words=words, spaces=[False]*len(words))
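Why switching to cut_all=False matters: jieba's full mode emits overlapping segment candidates, while the default accurate mode produces a single segmentation of the text, which is what Doc(words=..., spaces=...) expects. An illustrative comparison (requires jieba; the outputs shown are jieba's standard example and may vary by version and dictionary):

import jieba

text = '我来到北京清华大学'
print(list(jieba.cut(text, cut_all=True)))    # overlapping candidates, e.g. '清华', '清华大学', '华大', '大学'
print(list(jieba.cut(text, cut_all=False)))   # one segmentation: '我', '来到', '北京', '清华大学'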

View File

@ -34,6 +34,7 @@ from .lang.tag_map import TAG_MAP
from .lang.lex_attrs import LEX_ATTRS
from . import util
from .scorer import Scorer
from ._ml import link_vectors_to_models
class BaseDefaults(object):
@ -278,8 +279,7 @@ class Language(object):
def make_doc(self, text):
return self.tokenizer(text)
def update(self, docs, golds, drop=0., sgd=None, losses=None,
update_shared=False):
def update(self, docs, golds, drop=0., sgd=None, losses=None):
"""Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects.
@ -303,32 +303,17 @@ class Language(object):
if self._optimizer is None:
self._optimizer = Adam(Model.ops, 0.001)
sgd = self._optimizer
tok2vec = self.pipeline[0]
feats = tok2vec.doc2feats(docs)
grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
pipes = list(self.pipeline[1:])
pipes = list(self.pipeline)
random.shuffle(pipes)
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
for proc in pipes:
if not hasattr(proc, 'update'):
continue
d_tokvecses = proc.update((docs, tokvecses), golds,
drop=drop, sgd=get_grads, losses=losses)
if update_shared and d_tokvecses is not None:
for i, d_tv in enumerate(d_tokvecses):
all_d_tokvecses[i] += d_tv
if update_shared and bp_tokvecses is not None:
bp_tokvecses(all_d_tokvecses, sgd=sgd)
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
# Clear the tensor variable, to free GPU memory.
# If we don't do this, the memory leak gets pretty
# bad, because we may be holding part of a batch.
for doc in docs:
doc.tensor = None
def preprocess_gold(self, docs_golds):
"""Can be called before training to pre-process gold data. By default,
@ -343,36 +328,49 @@ class Language(object):
for doc, gold in docs_golds:
yield doc, gold
def begin_training(self, get_gold_tuples, **cfg):
def resume_training(self, **cfg):
if cfg.get('device', -1) >= 0:
device = util.use_gpu(cfg['device'])
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = Model.ops.asarray(
self.vocab.vectors.data)
else:
device = None
learn_rate = util.env_opt('learn_rate', 0.001)
beta1 = util.env_opt('optimizer_B1', 0.9)
beta2 = util.env_opt('optimizer_B2', 0.999)
eps = util.env_opt('optimizer_eps', 1e-08)
L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
self._optimizer.max_grad_norm = max_grad_norm
self._optimizer.device = device
return self._optimizer
def begin_training(self, get_gold_tuples=None, **cfg):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer. Used as a contextmanager.
gold_tuples (iterable): Gold-standard training data.
get_gold_tuples (function): Function returning gold data
**cfg: Config parameters.
YIELDS (tuple): A trainer and an optimizer.
EXAMPLE:
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
>>> for epoch in trainer.epochs(gold):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
returns: An optimizer
"""
if self.parser:
self.pipeline.append(NeuralLabeller(self.vocab))
# Populate vocab
for _, annots_brackets in get_gold_tuples():
for annots, _ in annots_brackets:
for word in annots[1]:
_ = self.vocab[word]
if get_gold_tuples is not None:
for _, annots_brackets in get_gold_tuples():
for annots, _ in annots_brackets:
for word in annots[1]:
_ = self.vocab[word]
contexts = []
if cfg.get('device', -1) >= 0:
import cupy.cuda.device
device = cupy.cuda.device.Device(cfg['device'])
device.use()
Model.ops = CupyOps()
Model.Ops = CupyOps
device = util.use_gpu(cfg['device'])
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = Model.ops.asarray(
self.vocab.vectors.data)
else:
device = None
link_vectors_to_models(self.vocab)
for proc in self.pipeline:
if hasattr(proc, 'begin_training'):
context = proc.begin_training(get_gold_tuples(),
@ -390,7 +388,7 @@ class Language(object):
self._optimizer.device = device
return self._optimizer
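A sketch of the calling convention these changes imply: begin_training now takes a callable that returns the gold tuples, resume_training prepares an optimizer for an already-initialized model, and update no longer receives tokvecs or an update_shared flag. Here nlp, get_gold_tuples and train_batches are placeholders, not spaCy APIs.

losses = {}
optimizer = nlp.begin_training(get_gold_tuples)   # or nlp.resume_training() for an existing model
for docs, golds in train_batches:                 # each item: a batch of Docs and matching GoldParses
    nlp.update(docs, golds, drop=0.2, sgd=optimizer, losses=losses)
print(losses)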
def evaluate(self, docs_golds):
def evaluate(self, docs_golds, verbose=False):
scorer = Scorer()
docs, golds = zip(*docs_golds)
docs = list(docs)
@ -403,8 +401,9 @@ class Language(object):
docs = list(pipe.pipe(docs))
assert len(docs) == len(golds)
for doc, gold in zip(docs, golds):
scorer.score(doc, gold)
doc.tensor = None
if verbose:
print(doc)
scorer.score(doc, gold, verbose=verbose)
return scorer
@contextmanager
@ -493,7 +492,6 @@ class Language(object):
"""
path = util.ensure_path(path)
serializers = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)),
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
))
@ -505,6 +503,7 @@ class Language(object):
if not hasattr(proc, 'to_disk'):
continue
serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
serializers['vocab'] = lambda p: self.vocab.to_disk(p)
util.to_disk(path, serializers, {p: False for p in disable})
def from_disk(self, path, disable=tuple()):

View File

@ -38,7 +38,8 @@ class Lemmatizer(object):
avoid lemmatization entirely.
"""
morphology = {} if morphology is None else morphology
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
others = [key for key in morphology
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
true_morph_key = morphology.get('morph', 0)
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
return True
@ -47,7 +48,9 @@ class Lemmatizer(object):
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
morphology.get('Tense') == 'pres'):
morphology.get('Tense') == 'pres' and \
morphology.get('Number') is None and \
not others):
return True
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
return True
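The practical effect of the tightened verb condition, assuming this hunk is the lemmatizer's base-form check (the method name is not visible here, so is_base_form and the lemmatizer variable are assumptions): a finite present form that carries a Number feature is no longer treated as a base form.

morph_3sg  = {'VerbForm': 'fin', 'Tense': 'pres', 'Number': 'sing'}
morph_base = {'VerbForm': 'fin', 'Tense': 'pres'}
lemmatizer.is_base_form('verb', morph_3sg)    # now False, so e.g. "says" still gets lemmatized
lemmatizer.is_base_form('verb', morph_base)   # True, a bare finite present form is left alone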

View File

@ -421,47 +421,69 @@ cdef class PhraseMatcher:
cdef int max_length
cdef attr_t* _phrase_key
def __init__(self, Vocab vocab, phrases, max_length=10):
cdef public object _callbacks
cdef public object _patterns
def __init__(self, Vocab vocab, max_length=10):
self.mem = Pool()
self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
self.max_length = max_length
self.vocab = vocab
self.matcher = Matcher(self.vocab, {})
self.matcher = Matcher(self.vocab)
self.phrase_ids = PreshMap()
for phrase in phrases:
if len(phrase) < max_length:
self.add(phrase)
abstract_patterns = []
for length in range(1, max_length):
abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
self.matcher.add('Candidate', 'MWE', {}, abstract_patterns, acceptor=self.accept_match)
self.matcher.add('Candidate', None, *abstract_patterns)
self._callbacks = {}
def add(self, Doc tokens):
cdef int length = tokens.length
assert length < self.max_length
tags = get_bilou(length)
assert len(tags) == length, length
def __len__(self):
raise NotImplementedError
def __contains__(self, key):
raise NotImplementedError
def __reduce__(self):
return (self.__class__, (self.vocab,), None, None)
def add(self, key, on_match, *docs):
cdef Doc doc
for doc in docs:
if len(doc) >= self.max_length:
msg = (
"Pattern length (%d) >= phrase_matcher.max_length (%d). "
"Length can be set on initialization, up to 10."
)
raise ValueError(msg % (len(doc), self.max_length))
cdef hash_t ent_id = self.matcher._normalize_key(key)
self._callbacks[ent_id] = on_match
cdef int length
cdef int i
for i in range(self.max_length):
self._phrase_key[i] = 0
for i, tag in enumerate(tags):
lexeme = self.vocab[tokens.c[i].lex.orth]
lexeme.set_flag(tag, True)
self._phrase_key[i] = lexeme.orth
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
self.phrase_ids[key] = True
cdef hash_t phrase_hash
for doc in docs:
length = doc.length
tags = get_bilou(length)
for i in range(self.max_length):
self._phrase_key[i] = 0
for i, tag in enumerate(tags):
lexeme = self.vocab[doc.c[i].lex.orth]
lexeme.set_flag(tag, True)
self._phrase_key[i] = lexeme.orth
phrase_hash = hash64(self._phrase_key,
self.max_length * sizeof(attr_t), 0)
self.phrase_ids.set(phrase_hash, <void*>ent_id)
def __call__(self, Doc doc):
matches = []
for ent_id, label, start, end in self.matcher(doc):
cand = doc[start : end]
start = cand[0].idx
end = cand[-1].idx + len(cand[-1])
matches.append((start, end, cand.root.tag_, cand.text, 'MWE'))
for match in matches:
doc.merge(*match)
for _, start, end in self.matcher(doc):
ent_id = self.accept_match(doc, start, end)
if ent_id is not None:
matches.append((ent_id, start, end))
for i, (ent_id, start, end) in enumerate(matches):
on_match = self._callbacks.get(ent_id)
if on_match is not None:
on_match(self, doc, i, matches)
return matches
def pipe(self, stream, batch_size=1000, n_threads=2):
@ -469,7 +491,7 @@ cdef class PhraseMatcher:
self(doc)
yield doc
def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end):
def accept_match(self, Doc doc, int start, int end):
assert (end - start) < self.max_length
cdef int i, j
for i in range(self.max_length):
@ -477,7 +499,8 @@ cdef class PhraseMatcher:
for i, j in enumerate(range(start, end)):
self._phrase_key[i] = doc.c[j].lex.orth
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
if self.phrase_ids.get(key):
return (ent_id, label, start, end)
ent_id = <hash_t>self.phrase_ids.get(key)
if ent_id == 0:
return None
else:
return False
return ent_id
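A hedged usage sketch of the reworked API above: patterns are now Doc objects added under a key with an optional on_match callback, and calling the matcher yields (key, start, end) triples. The nlp object and the 'PEOPLE' key are assumptions for illustration; nlp can be any pipeline sharing the matcher's vocab.

from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab, max_length=10)
patterns = [nlp.make_doc(text) for text in ('Barack Obama', 'Angela Merkel')]
matcher.add('PEOPLE', None, *patterns)            # key, on_match callback, then the pattern Docs

doc = nlp.make_doc('Barack Obama met Angela Merkel')
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)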

View File

@ -146,6 +146,8 @@ cdef class Morphology:
self.add_special_case(tag_str, form_str, attrs)
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
if orth not in self.strings:
return orth
cdef unicode py_string = self.strings[orth]
if self.lemmatizer is None:
return self.strings.add(py_string.lower())

View File

@ -4,7 +4,6 @@
from __future__ import unicode_literals
from thinc.api import chain, layerize, with_getitem
from thinc.neural import Model, Softmax
import numpy
cimport numpy as np
import cytoolz
@ -14,17 +13,18 @@ import ujson
import msgpack
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
from thinc.i2v import HashEmbed
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.misc import Residual
from thinc.misc import BatchNorm as BN
from thinc.misc import LayerNorm as LN
from thinc.neural.util import to_categorical
from thinc.neural.pooling import Pooling, max_pool, mean_pool
from thinc.neural._classes.difference import Siamese, CauchySimilarity
from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.resnet import Residual
from thinc.neural._classes.batchnorm import BatchNorm as BN
from .tokens.doc cimport Doc
from .syntax.parser cimport Parser as LinearParser
from .syntax.nn_parser cimport Parser as NeuralParser
@ -41,13 +41,14 @@ from .syntax import nonproj
from .compat import json_dumps
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
from ._ml import rebatch, Tok2Vec, flatten
from ._ml import build_text_classifier, build_tagger_model
from ._ml import link_vectors_to_models
from .parts_of_speech import X
class SentenceSegmenter(object):
'''A simple spaCy hook, to allow custom sentence boundary detection logic
"""A simple spaCy hook, to allow custom sentence boundary detection logic
(that doesn't require the dependency parse).
To change the sentence boundary detection strategy, pass a generator
@ -56,7 +57,7 @@ class SentenceSegmenter(object):
Sentence detection strategies should be generators that take `Doc` objects
and yield `Span` objects for each sentence.
'''
"""
name = 'sbd'
def __init__(self, vocab, strategy=None):
@ -88,17 +89,30 @@ class BaseThincComponent(object):
@classmethod
def Model(cls, *shape, **kwargs):
"""Initialize a model for the pipe."""
raise NotImplementedError
def __init__(self, vocab, model=True, **cfg):
"""Create a new pipe instance."""
raise NotImplementedError
def __call__(self, doc):
"""Apply the pipe to one document. The document is
modified in-place, and returned.
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
scores = self.predict([doc])
self.set_annotations([doc], scores)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
"""Apply the pipe to a stream of documents.
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)
@ -106,27 +120,43 @@ class BaseThincComponent(object):
yield from docs
def predict(self, docs):
"""Apply the pipeline's model to a batch of docs, without
modifying them.
"""
raise NotImplementedError
def set_annotations(self, docs, scores):
"""Modify a batch of documents, using pre-computed scores."""
raise NotImplementedError
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
def update(self, docs, golds, drop=0., sgd=None, losses=None):
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model.
Delegates to predict() and get_loss().
"""
raise NotImplementedError
def get_loss(self, docs, golds, scores):
"""Find the loss and gradient of loss for the batch of
documents and their predicted scores."""
raise NotImplementedError
def begin_training(self, gold_tuples=tuple(), pipeline=None):
token_vector_width = pipeline[0].model.nO
"""Initialize the pipe for training, using data exampes if available.
If no model has been initialized yet, the model is added."""
if self.model is True:
self.model = self.Model(1, token_vector_width)
self.model = self.Model(**self.cfg)
link_vectors_to_models(self.vocab)
def use_params(self, params):
"""Modify the pipe's model, to use the given parameter values.
"""
with self.model.use_params(params):
yield
def to_bytes(self, **exclude):
"""Serialize the pipe to a bytestring."""
serialize = OrderedDict((
('cfg', lambda: json_dumps(self.cfg)),
('model', lambda: self.model.to_bytes()),
@ -135,37 +165,42 @@ class BaseThincComponent(object):
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
"""Load the pipe from a bytestring."""
def load_model(b):
if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(**self.cfg)
self.model.from_bytes(b)
deserialize = OrderedDict((
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
('vocab', lambda b: self.vocab.from_bytes(b)),
('model', load_model),
('vocab', lambda b: self.vocab.from_bytes(b))
))
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
"""Serialize the pipe to disk."""
serialize = OrderedDict((
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
('vocab', lambda p: self.vocab.to_disk(p)),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('vocab', lambda p: self.vocab.to_disk(p))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
"""Load the pipe from disk."""
def load_model(p):
if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(**self.cfg)
self.model.from_bytes(p.open('rb').read())
deserialize = OrderedDict((
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
('model', load_model),
('vocab', lambda p: self.vocab.from_disk(p)),
('model', load_model),
))
util.from_disk(path, deserialize, exclude)
return self
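To make the contract above concrete, a toy component that follows it. This is an illustration, not part of spaCy: it has no real model, and it assumes BaseThincComponent can be imported from spacy.pipeline and that Doc.user_data is available.

from spacy.pipeline import BaseThincComponent

class TokenCounter(BaseThincComponent):
    name = 'token_counter'

    def __init__(self, vocab, model=True, **cfg):
        self.vocab = vocab
        self.model = model                 # no statistical model in this toy example
        self.cfg = dict(cfg)

    def predict(self, docs):
        # "scores" here are just token counts, to keep the example self-contained
        return [len(doc) for doc in docs]

    def set_annotations(self, docs, scores):
        for doc, n_tokens in zip(docs, scores):
            doc.user_data['n_tokens'] = n_tokens

The inherited __call__ and pipe then delegate to predict() and set_annotations(), as described in the docstrings above.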
@ -193,7 +228,7 @@ class TokenVectorEncoder(BaseThincComponent):
"""
width = util.env_opt('token_vector_width', width)
embed_size = util.env_opt('embed_size', embed_size)
return Tok2Vec(width, embed_size, preprocess=None)
return Tok2Vec(width, embed_size, **cfg)
def __init__(self, vocab, model=True, **cfg):
"""Construct a new statistical model. Weights are not allocated on
@ -210,9 +245,10 @@ class TokenVectorEncoder(BaseThincComponent):
>>> tok2vec.model = tok2vec.Model(128, 5000)
"""
self.vocab = vocab
self.doc2feats = doc2feats()
self.model = model
self.cfg = dict(cfg)
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.cfg.setdefault('cnn_maxout_pieces', 3)
def __call__(self, doc):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
@ -245,8 +281,7 @@ class TokenVectorEncoder(BaseThincComponent):
docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the documents.
"""
feats = self.doc2feats(docs)
tokvecs = self.model(feats)
tokvecs = self.model(docs)
return tokvecs
def set_annotations(self, docs, tokvecses):
@ -270,8 +305,7 @@ class TokenVectorEncoder(BaseThincComponent):
"""
if isinstance(docs, Doc):
docs = [docs]
feats = self.doc2feats(docs)
tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop)
tokvecs, bp_tokvecs = self.model.begin_update(docs, drop=drop)
return tokvecs, bp_tokvecs
def get_loss(self, docs, golds, scores):
@ -285,9 +319,10 @@ class TokenVectorEncoder(BaseThincComponent):
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
"""
self.doc2feats = doc2feats()
if self.model is True:
self.model = self.Model()
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(**self.cfg)
link_vectors_to_models(self.vocab)
class NeuralTagger(BaseThincComponent):
@ -296,29 +331,29 @@ class NeuralTagger(BaseThincComponent):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
def __call__(self, doc):
tags = self.predict(([doc], [doc.tensor]))
tags = self.predict([doc])
self.set_annotations([doc], tags)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tokvecs = [d.tensor for d in docs]
tag_ids = self.predict((docs, tokvecs))
tag_ids = self.predict(docs)
self.set_annotations(docs, tag_ids)
yield from docs
def predict(self, docs_tokvecs):
scores = self.model(docs_tokvecs)
def predict(self, docs):
scores = self.model(docs)
scores = self.model.ops.flatten(scores)
guesses = scores.argmax(axis=1)
if not isinstance(guesses, numpy.ndarray):
guesses = guesses.get()
tokvecs = docs_tokvecs[1]
guesses = self.model.ops.unflatten(guesses,
[tv.shape[0] for tv in tokvecs])
[len(d) for d in docs])
return guesses
def set_annotations(self, docs, batch_tag_ids):
@ -338,20 +373,16 @@ class NeuralTagger(BaseThincComponent):
idx += 1
doc.is_tagged = True
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
def update(self, docs, golds, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvecs = docs_tokvecs
if self.model.nI is None:
self.model.nI = tokvecs[0].shape[1]
tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
bp_tag_scores(d_tag_scores, sgd=sgd)
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
if losses is not None:
losses[self.name] += loss
return d_tokvecs
def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores)
@ -392,14 +423,15 @@ class NeuralTagger(BaseThincComponent):
vocab.morphology = Morphology(vocab.strings, new_tag_map,
vocab.morphology.lemmatizer,
exc=vocab.morphology.exc)
token_vector_width = pipeline[0].model.nO
if self.model is True:
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
link_vectors_to_models(self.vocab)
@classmethod
def Model(cls, n_tags, token_vector_width):
return build_tagger_model(n_tags, token_vector_width)
def Model(cls, n_tags, **cfg):
return build_tagger_model(n_tags, **cfg)
def use_params(self, params):
with self.model.use_params(params):
yield
@ -419,7 +451,7 @@ class NeuralTagger(BaseThincComponent):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width',
self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
self.model.from_bytes(b)
def load_tag_map(b):
@ -428,7 +460,7 @@ class NeuralTagger(BaseThincComponent):
self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer,
exc=self.vocab.morphology.exc)
deserialize = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('tag_map', load_tag_map),
@ -438,6 +470,7 @@ class NeuralTagger(BaseThincComponent):
return self
def to_disk(self, path, **exclude):
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
serialize = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)),
('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
@ -452,9 +485,7 @@ class NeuralTagger(BaseThincComponent):
def from_disk(self, path, **exclude):
def load_model(p):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width',
self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
self.model.from_bytes(p.open('rb').read())
def load_tag_map(p):
@ -466,10 +497,10 @@ class NeuralTagger(BaseThincComponent):
exc=self.vocab.morphology.exc)
deserialize = OrderedDict((
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
('vocab', lambda p: self.vocab.from_disk(p)),
('tag_map', load_tag_map),
('model', load_model),
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
))
util.from_disk(path, deserialize, exclude)
return self
@ -477,10 +508,28 @@ class NeuralTagger(BaseThincComponent):
class NeuralLabeller(NeuralTagger):
name = 'nn_labeller'
def __init__(self, vocab, model=True, **cfg):
def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
self.vocab = vocab
self.model = model
if target == 'dep':
self.make_label = self.make_dep
elif target == 'tag':
self.make_label = self.make_tag
elif target == 'ent':
self.make_label = self.make_ent
elif target == 'dep_tag_offset':
self.make_label = self.make_dep_tag_offset
elif target == 'ent_tag':
self.make_label = self.make_ent_tag
elif hasattr(target, '__call__'):
self.make_label = target
else:
raise ValueError(
"NeuralLabeller target should be function or one of "
"['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
@property
def labels(self):
@ -493,41 +542,79 @@ class NeuralLabeller(NeuralTagger):
def set_annotations(self, docs, dep_ids):
pass
def begin_training(self, gold_tuples=tuple(), pipeline=None):
def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
for raw_text, annots_brackets in gold_tuples:
for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots
for dep in deps:
if dep not in self.labels:
self.labels[dep] = len(self.labels)
token_vector_width = pipeline[0].model.nO
for i in range(len(ids)):
label = self.make_label(i, words, tags, heads, deps, ents)
if label is not None and label not in self.labels:
self.labels[label] = len(self.labels)
print(len(self.labels))
if self.model is True:
self.model = self.Model(len(self.labels), token_vector_width)
token_vector_width = util.env_opt('token_vector_width')
self.model = chain(
tok2vec,
Softmax(len(self.labels), token_vector_width)
)
link_vectors_to_models(self.vocab)
@classmethod
def Model(cls, n_tags, token_vector_width):
return build_tagger_model(n_tags, token_vector_width)
def Model(cls, n_tags, tok2vec=None, **cfg):
return build_tagger_model(n_tags, tok2vec=tok2vec, **cfg)
def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores)
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype='i')
guesses = scores.argmax(axis=1)
for gold in golds:
for tag in gold.labels:
if tag is None or tag not in self.labels:
for i in range(len(gold.labels)):
label = self.make_label(i, gold.words, gold.tags, gold.heads,
gold.labels, gold.ents)
if label is None or label not in self.labels:
correct[idx] = guesses[idx]
else:
correct[idx] = self.labels[tag]
correct[idx] = self.labels[label]
idx += 1
correct = self.model.ops.xp.array(correct, dtype='i')
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores /= d_scores.shape[0]
loss = (d_scores**2).sum()
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
@staticmethod
def make_dep(i, words, tags, heads, deps, ents):
if deps[i] is None or heads[i] is None:
return None
return deps[i]
@staticmethod
def make_tag(i, words, tags, heads, deps, ents):
return tags[i]
@staticmethod
def make_ent(i, words, tags, heads, deps, ents):
if ents is None:
return None
return ents[i]
@staticmethod
def make_dep_tag_offset(i, words, tags, heads, deps, ents):
if deps[i] is None or heads[i] is None:
return None
offset = heads[i] - i
offset = min(offset, 2)
offset = max(offset, -2)
return '%s-%s:%d' % (deps[i], tags[i], offset)
@staticmethod
def make_ent_tag(i, words, tags, heads, deps, ents):
if ents is None or ents[i] is None:
return None
else:
return '%s-%s' % (tags[i], ents[i])
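For a sense of what the auxiliary targets look like, the static label functions can be called on a tiny made-up annotation (values chosen only for illustration):

words = ['She', 'sleeps']
tags  = ['PRP', 'VBZ']
heads = [1, 1]
deps  = ['nsubj', 'ROOT']
ents  = ['O', 'O']

NeuralLabeller.make_dep(0, words, tags, heads, deps, ents)             # 'nsubj'
NeuralLabeller.make_dep_tag_offset(0, words, tags, heads, deps, ents)  # 'nsubj-PRP:1'
NeuralLabeller.make_ent_tag(0, words, tags, heads, deps, ents)         # 'PRP-O'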
class SimilarityHook(BaseThincComponent):
"""
@ -555,7 +642,7 @@ class SimilarityHook(BaseThincComponent):
return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
def __call__(self, doc):
'''Install similarity hook'''
"""Install similarity hook"""
doc.user_hooks['similarity'] = self.predict
return doc
@ -564,15 +651,10 @@ class SimilarityHook(BaseThincComponent):
yield self(doc)
def predict(self, doc1, doc2):
return self.model.predict([(doc1.tensor, doc2.tensor)])
return self.model.predict([(doc1, doc2)])
def update(self, doc1_tensor1_doc2_tensor2, golds, sgd=None, drop=0.):
doc1s, tensor1s, doc2s, tensor2s = doc1_tensor1_doc2_tensor2
sims, bp_sims = self.model.begin_update(zip(tensor1s, tensor2s),
drop=drop)
d_tensor1s, d_tensor2s = bp_sims(golds, sgd=sgd)
return d_tensor1s, d_tensor2s
def update(self, doc1_doc2, golds, sgd=None, drop=0.):
sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
def begin_training(self, _=tuple(), pipeline=None):
"""
@ -583,6 +665,7 @@ class SimilarityHook(BaseThincComponent):
"""
if self.model is True:
self.model = self.Model(pipeline[0].model.nO)
link_vectors_to_models(self.vocab)
class TextCategorizer(BaseThincComponent):
@ -627,15 +710,13 @@ class TextCategorizer(BaseThincComponent):
for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j])
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
docs, tensors = docs_tensors
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
scores, bp_scores = self.model.begin_update(docs, drop=drop)
loss, d_scores = self.get_loss(docs, golds, scores)
d_tensors = bp_scores(d_scores, sgd=sgd)
bp_scores(d_scores, sgd=sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
return d_tensors
def get_loss(self, docs, golds, scores):
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
@ -653,8 +734,10 @@ class TextCategorizer(BaseThincComponent):
else:
token_vector_width = 64
if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(len(self.labels), token_vector_width,
**self.cfg)
link_vectors_to_models(self.vocab)
cdef class EntityRecognizer(LinearParser):
@ -695,6 +778,14 @@ cdef class NeuralDependencyParser(NeuralParser):
name = 'parser'
TransitionSystem = ArcEager
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
for target in []:
labeller = NeuralLabeller(self.vocab, target=target)
tok2vec = self.model[0]
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
pipeline.append(labeller)
self._multitasks.append(labeller)
def __reduce__(self):
return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
@ -705,13 +796,13 @@ cdef class NeuralEntityRecognizer(NeuralParser):
nr_feature = 6
def predict_confidences(self, docs):
tensors = [d.tensor for d in docs]
samples = []
for i in range(10):
states = self.parse_batch(docs, tensors, drop=0.3)
for state in states:
samples.append(self._get_entities(state))
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
for target in []:
labeller = NeuralLabeller(self.vocab, target=target)
tok2vec = self.model[0]
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
pipeline.append(labeller)
self._multitasks.append(labeller)
def __reduce__(self):
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)

View File

@ -1,4 +1,4 @@
cpdef enum symbol_t:
cdef enum symbol_t:
NIL
IS_ALPHA
IS_ASCII

View File

@ -1,4 +1,6 @@
# coding: utf8
#cython: optimize.unpack_method_calls=False
from __future__ import unicode_literals
IDS = {
@ -458,4 +460,11 @@ IDS = {
"xcomp": xcomp
}
NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])]
def sort_nums(x):
return x[1]
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)
# We keep the enum cdef, and just make sure the names are available to Python
locals().update(IDS)

View File

@ -147,10 +147,10 @@ def get_token_ids(states, int n_tokens):
nr_update = 0
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
states, tokvecs, golds,
states, golds,
state2vec, vec2scores,
int width, float density,
sgd=None, losses=None, drop=0.):
losses=None, drop=0.):
global nr_update
cdef MaxViolation violn
nr_update += 1

View File

@ -101,9 +101,10 @@ cdef cppclass StateC:
elif n == 6:
if this.B(0) >= 0:
ids[0] = this.B(0)
ids[1] = this.B(0)-1
else:
ids[0] = -1
ids[1] = this.B(0)
ids[1] = -1
ids[2] = this.B(1)
ids[3] = this.E(0)
if ids[3] >= 1:
@ -120,6 +121,8 @@ cdef cppclass StateC:
for i in range(n):
if ids[i] >= 0:
ids[i] += this.offset
else:
ids[i] = -1
int S(int i) nogil const:
if i >= this._s_i:
@ -162,9 +165,9 @@ cdef cppclass StateC:
int E(int i) nogil const:
if this._e_i <= 0 or this._e_i >= this.length:
return 0
return -1
if i < 0 or i >= this._e_i:
return 0
return -1
return this._ents[this._e_i - (i+1)].start
int L(int i, int idx) nogil const:

View File

@ -161,8 +161,7 @@ cdef class BiluoPushDown(TransitionSystem):
cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name == None:
move_str = 'M'
label = 0
return Transition(clas=0, move=MISSING, label=0, score=0)
elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0)
elif '-' in name:
@ -220,6 +219,31 @@ cdef class BiluoPushDown(TransitionSystem):
raise Exception(move)
return t
#def add_action(self, int action, label_name):
# cdef attr_t label_id
# if not isinstance(label_name, (int, long)):
# label_id = self.strings.add(label_name)
# else:
# label_id = label_name
# if action == OUT and label_id != 0:
# return
# if action == MISSING or action == ISNT:
# return
# # Check we're not creating a move we already have, so that this is
# # idempotent
# for trans in self.c[:self.n_moves]:
# if trans.move == action and trans.label == label_id:
# return 0
# if self.n_moves >= self._size:
# self._size *= 2
# self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
# self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
# assert self.c[self.n_moves].label == label_id
# self.n_moves += 1
# return 1
cdef int initialize_state(self, StateC* st) nogil:
# This is especially necessary when we use limited training data.
for i in range(st.length):

View File

@ -13,6 +13,7 @@ cdef class Parser:
cdef public object model
cdef readonly TransitionSystem moves
cdef readonly object cfg
cdef public object _multitasks
cdef void _parse_step(self, StateC* state,
const float* feat_weights,

View File

@ -7,6 +7,7 @@ from __future__ import unicode_literals, print_function
from collections import Counter, OrderedDict
import ujson
import json
import contextlib
from libc.math cimport exp
@ -37,10 +38,9 @@ from preshed.maps cimport MapStruct
from preshed.maps cimport map_get
from thinc.api import layerize, chain, noop, clone, with_flatten
from thinc.neural import Model, Affine, ReLu, Maxout
from thinc.neural._classes.batchnorm import BatchNorm as BN
from thinc.neural._classes.selu import SELU
from thinc.neural._classes.layernorm import LayerNorm
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
from thinc.misc import LayerNorm
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
@ -48,7 +48,8 @@ from .. import util
from ..util import get_async, get_cuda_stream
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
from .._ml import Residual, drop_layer
from .._ml import Residual, drop_layer, flatten
from .._ml import link_vectors_to_models
from ..compat import json_dumps
from . import _parse_features
@ -238,14 +239,15 @@ cdef class Parser:
Base class of the DependencyParser and EntityRecognizer.
"""
@classmethod
def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg):
def Model(cls, nr_class, token_vector_width=128, hidden_width=200, depth=1, **cfg):
depth = util.env_opt('parser_hidden_depth', depth)
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
hidden_width = util.env_opt('hidden_width', hidden_width)
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
embed_size = util.env_opt('embed_size', 4000)
tensors = fine_tune(Tok2Vec(token_vector_width, embed_size,
preprocess=doc2feats()))
embed_size = util.env_opt('embed_size', 7000)
tok2vec = Tok2Vec(token_vector_width, embed_size,
pretrained_dims=cfg.get('pretrained_dims', 0))
tok2vec = chain(tok2vec, flatten)
if parser_maxout_pieces == 1:
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature,
@ -262,8 +264,8 @@ cdef class Parser:
upper.is_noop = True
else:
upper = chain(
clone(Maxout(hidden_width), (depth-1)),
zero_init(Affine(nr_class, drop_factor=0.0))
clone(Maxout(hidden_width), depth-1),
zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
)
upper.is_noop = False
# TODO: This is an unfortunate hack atm!
@ -277,7 +279,7 @@ cdef class Parser:
'hidden_width': hidden_width,
'maxout_pieces': parser_maxout_pieces
}
return (tensors, lower, upper), cfg
return (tok2vec, lower, upper), cfg
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
"""
@ -307,12 +309,16 @@ cdef class Parser:
cfg['beam_width'] = util.env_opt('beam_width', 1)
if 'beam_density' not in cfg:
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
if 'pretrained_dims' not in cfg:
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
cfg.setdefault('cnn_maxout_pieces', 3)
self.cfg = cfg
if 'actions' in self.cfg:
for action, labels in self.cfg.get('actions', {}).items():
for label in labels:
self.moves.add_action(action, label)
self.model = model
self._multitasks = []
def __reduce__(self):
return (Parser, (self.vocab, self.moves, self.model), None, None)
@ -332,11 +338,11 @@ cdef class Parser:
beam_density = self.cfg.get('beam_density', 0.0)
cdef Beam beam
if beam_width == 1:
states = self.parse_batch([doc], [doc.tensor])
states = self.parse_batch([doc])
self.set_annotations([doc], states)
return doc
else:
beam = self.beam_parse([doc], [doc.tensor],
beam = self.beam_parse([doc],
beam_width=beam_width, beam_density=beam_density)[0]
output = self.moves.get_beam_annot(beam)
state = <StateClass>beam.at(0)
@ -365,11 +371,11 @@ cdef class Parser:
cdef Beam beam
for docs in cytoolz.partition_all(batch_size, docs):
docs = list(docs)
tokvecs = [doc.tensor for doc in docs]
if beam_width == 1:
parse_states = self.parse_batch(docs, tokvecs)
parse_states = self.parse_batch(docs)
beams = []
else:
beams = self.beam_parse(docs, tokvecs,
beams = self.beam_parse(docs,
beam_width=beam_width, beam_density=beam_density)
parse_states = []
for beam in beams:
@ -377,7 +383,7 @@ cdef class Parser:
self.set_annotations(docs, parse_states)
yield from docs
def parse_batch(self, docs, tokvecses):
def parse_batch(self, docs):
cdef:
precompute_hiddens state2vec
StateClass state
@ -388,21 +394,15 @@ cdef class Parser:
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
if isinstance(docs, Doc):
docs = [docs]
if isinstance(tokvecses, np.ndarray):
tokvecses = [tokvecses]
tokvecs = self.model[0].ops.flatten(tokvecses)
if USE_FINE_TUNE:
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
cuda_stream = get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
0.0)
nr_state = len(docs)
nr_class = self.moves.n_moves
nr_dim = tokvecs.shape[1]
nr_feat = self.nr_feature
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs,
cuda_stream, 0.0)
nr_piece = state2vec.nP
states = self.moves.init_batch(docs)
@ -418,21 +418,23 @@ cdef class Parser:
c_token_ids = <int*>token_ids.data
c_is_valid = <int*>is_valid.data
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
cdef int nr_step
while not next_step.empty():
nr_step = next_step.size()
if not has_hidden:
for i in cython.parallel.prange(
next_step.size(), num_threads=6, nogil=True):
for i in cython.parallel.prange(nr_step, num_threads=6,
nogil=True):
self._parse_step(next_step[i],
feat_weights, nr_class, nr_feat, nr_piece)
else:
for i in range(next_step.size()):
for i in range(nr_step):
st = next_step[i]
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
self.moves.set_valid(&c_is_valid[i*nr_class], st)
vectors = state2vec(token_ids[:next_step.size()])
scores = vec2scores(vectors)
c_scores = <float*>scores.data
for i in range(next_step.size()):
for i in range(nr_step):
st = next_step[i]
guess = arg_max_if_valid(
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
@ -445,18 +447,15 @@ cdef class Parser:
next_step.push_back(st)
return states
def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001):
def beam_parse(self, docs, int beam_width=3, float beam_density=0.001):
cdef Beam beam
cdef np.ndarray scores
cdef Doc doc
cdef int nr_class = self.moves.n_moves
cdef StateClass stcls, output
tokvecs = self.model[0].ops.flatten(tokvecses)
if USE_FINE_TUNE:
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
cuda_stream, 0.0)
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
0.0)
beams = []
cdef int offset = 0
cdef int j = 0
@ -516,29 +515,24 @@ cdef class Parser:
free(scores)
free(token_ids)
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
def update(self, docs, golds, drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):
return None
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
return self.update_beam(docs_tokvecs, golds,
return self.update_beam(docs, golds,
self.cfg['beam_width'], self.cfg['beam_density'],
drop=drop, sgd=sgd, losses=losses)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvec_lists = docs_tokvecs
tokvecs = self.model[0].ops.flatten(tokvec_lists)
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs]
golds = [golds]
if USE_FINE_TUNE:
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
tokvecs = self.model[0].ops.flatten(my_tokvecs)
cuda_stream = get_cuda_stream()
states, golds, max_steps = self._init_gold_batch(docs, golds)
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
0.0)
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
drop)
todo = [(s, g) for (s, g) in zip(states, golds)
if not s.is_final() and g is not None]
if not todo:
@ -582,13 +576,9 @@ cdef class Parser:
if n_steps >= max_steps:
break
self._make_updates(d_tokvecs,
backprops, sgd, cuda_stream)
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
if USE_FINE_TUNE:
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs
bp_tokvecs, backprops, sgd, cuda_stream)
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
def update_beam(self, docs, golds, width=None, density=None,
drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):
return None
@ -600,26 +590,20 @@ cdef class Parser:
density = self.cfg.get('beam_density', 0.0)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvecs = docs_tokvecs
lengths = [len(d) for d in docs]
assert min(lengths) >= 1
tokvecs = self.model[0].ops.flatten(tokvecs)
if USE_FINE_TUNE:
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
tokvecs += self.model[0].ops.flatten(my_tokvecs)
states = self.moves.init_batch(docs)
for gold in golds:
self.moves.preprocess_gold(gold)
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
states, tokvecs, golds,
states, golds,
state2vec, vec2scores,
width, density,
sgd=sgd, drop=drop, losses=losses)
drop=drop, losses=losses)
backprop_lower = []
cdef float batch_size = len(docs)
for i, d_scores in enumerate(states_d_scores):
@ -637,11 +621,7 @@ cdef class Parser:
else:
backprop_lower.append((ids, d_vector, bp_vectors))
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
if USE_FINE_TUNE:
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)
def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long
@ -679,7 +659,7 @@ cdef class Parser:
max_moves = max(max_moves, len(oracle_actions))
return states, golds, max_moves
def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
def _make_updates(self, d_tokvecs, bp_tokvecs, backprops, sgd, cuda_stream=None):
# Tells CUDA to block, so our async copies complete.
if cuda_stream is not None:
cuda_stream.synchronize()
@ -690,6 +670,7 @@ cdef class Parser:
d_state_features *= mask.reshape(ids.shape + (1,))
self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
d_state_features)
bp_tokvecs(d_tokvecs, sgd=sgd)
@property
def move_names(self):
@ -699,11 +680,12 @@ cdef class Parser:
names.append(name)
return names
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
_, lower, upper = self.model
state2vec = precompute_hiddens(batch_size, tokvecs,
lower, stream, drop=dropout)
return state2vec, upper
def get_batch_model(self, docs, stream, dropout):
tok2vec, lower, upper = self.model
tokvecs, bp_tokvecs = tok2vec.begin_update(docs, drop=dropout)
state2vec = precompute_hiddens(len(docs), tokvecs,
lower, stream, drop=0.0)
return (tokvecs, bp_tokvecs), state2vec, upper
nr_feature = 8
@ -766,7 +748,7 @@ cdef class Parser:
# order, or the model goes out of synch
self.cfg.setdefault('extra_labels', []).append(label)
def begin_training(self, gold_tuples, **cfg):
def begin_training(self, gold_tuples, pipeline=None, **cfg):
if 'model' in cfg:
self.model = cfg['model']
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
@ -775,9 +757,22 @@ cdef class Parser:
for label in labels:
self.moves.add_action(action, label)
if self.model is True:
cfg['pretrained_dims'] = self.vocab.vectors_length
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
self.init_multitask_objectives(gold_tuples, pipeline, **cfg)
link_vectors_to_models(self.vocab)
self.cfg.update(cfg)
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
'''Set up models for secondary objectives, to benefit from multi-task
learning. This method is intended to be overridden by subclasses.
For instance, the dependency parser can benefit from sharing
an input representation with a label prediction model. These auxiliary
models are discarded after training.
'''
pass
def preprocess_gold(self, docs_golds):
for doc, gold in docs_golds:
yield doc, gold
@ -813,6 +808,7 @@ cdef class Parser:
if 'model' not in exclude:
path = util.ensure_path(path)
if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model, cfg = self.Model(**self.cfg)
else:
cfg = {}
@ -835,7 +831,7 @@ cdef class Parser:
('upper_model', lambda: self.model[2].to_bytes()),
('vocab', lambda: self.vocab.to_bytes()),
('moves', lambda: self.moves.to_bytes(strings=False)),
('cfg', lambda: ujson.dumps(self.cfg))
('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True))
))
if 'model' in exclude:
exclude['tok2vec_model'] = True
@ -848,7 +844,7 @@ cdef class Parser:
deserializers = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
('cfg', lambda b: self.cfg.update(json.loads(b))),
('tok2vec_model', lambda b: None),
('lower_model', lambda b: None),
('upper_model', lambda b: None)
@ -856,9 +852,11 @@ cdef class Parser:
msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'model' not in exclude:
if self.model is True:
self.model, cfg = self.Model(self.moves.n_moves)
self.model, cfg = self.Model(**self.cfg)
cfg['pretrained_dims'] = self.vocab.vectors_length
else:
cfg = {}
cfg['pretrained_dims'] = self.vocab.vectors_length
if 'tok2vec_model' in msg:
self.model[0].from_bytes(msg['tok2vec_model'])
if 'lower_model' in msg:

View File

@ -148,7 +148,7 @@ cdef class TransitionSystem:
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, int):
if not isinstance(label_name, (int, long)):
label_id = self.strings.add(label_name)
else:
label_id = label_name

View File

@ -12,7 +12,7 @@ from .. import util
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'th','xx']
_models = {'en': ['en_core_web_sm'],
'de': ['de_core_news_md'],
'fr': ['fr_depvec_web_lg'],
@ -108,6 +108,11 @@ def he_tokenizer():
def nb_tokenizer():
return util.get_lang_class('nb').Defaults.create_tokenizer()
@pytest.fixture
def th_tokenizer():
pythainlp = pytest.importorskip("pythainlp")
return util.get_lang_class('th').Defaults.create_tokenizer()
@pytest.fixture
def stringstore():

View File

@ -67,12 +67,6 @@ def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
assert len(tokens) == 4
@pytest.mark.parametrize('text', ["blau-rot"])
def test_tokenizer_splits_hyphens(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_tokenizer_splits_numeric_range(de_tokenizer, text):
tokens = de_tokenizer(text)
@ -100,17 +94,21 @@ def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
assert len(tokens) == 3
@pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt'])
def test_tokenizer_keeps_hyphens(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 1
def test_tokenizer_splits_double_hyphen_infix(de_tokenizer):
tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
assert len(tokens) == 12
assert len(tokens) == 10
assert tokens[0].text == "Viele"
assert tokens[1].text == "Regeln"
assert tokens[2].text == "--"
assert tokens[3].text == "wie"
assert tokens[4].text == "die"
assert tokens[5].text == "Bindestrich"
assert tokens[6].text == "-"
assert tokens[7].text == "Regeln"
assert tokens[8].text == "--"
assert tokens[9].text == "sind"
assert tokens[10].text == "kompliziert"
assert tokens[5].text == "Bindestrich-Regeln"
assert tokens[6].text == "--"
assert tokens[7].text == "sind"
assert tokens[8].text == "kompliziert"

View File

@ -25,15 +25,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
assert len(tokens) == 109
@pytest.mark.parametrize('text,length', [
("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1),
("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1),
("Kraftfahrzeug-Haftpflichtversicherung", 3),
("Vakuum-Mittelfrequenz-Induktionsofen", 5)
@pytest.mark.parametrize('text', [
"Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
"Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
"Kraftfahrzeug-Haftpflichtversicherung",
"Vakuum-Mittelfrequenz-Induktionsofen"
])
def test_tokenizer_handles_long_words(de_tokenizer, text, length):
def test_tokenizer_handles_long_words(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == length
assert len(tokens) == 1
@pytest.mark.parametrize('text,length', [

View File

View File

@ -0,0 +1,13 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
TOKENIZER_TESTS = [
("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม'])
]
@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
def test_thai_tokenizer(th_tokenizer, text, expected_tokens):
tokens = [token.text for token in th_tokenizer(text)]
assert tokens == expected_tokens

View File

@ -26,7 +26,7 @@ def arc_eager(vocab):
@pytest.fixture
def tok2vec():
return Tok2Vec(8, 100, preprocess=doc2feats())
return Tok2Vec(8, 100)
@pytest.fixture
@ -61,33 +61,22 @@ def test_predict_doc(parser, tok2vec, model, doc):
parser(doc)
def test_update_doc(parser, tok2vec, model, doc, gold):
def test_update_doc(parser, model, doc, gold):
parser.model = model
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
d_tokvecs = parser.update(([doc], tokvecs), [gold])
assert d_tokvecs[0].shape == tokvecs[0].shape
def optimize(weights, gradient, key=None):
weights -= 0.001 * gradient
bp_tokvecs(d_tokvecs, sgd=optimize)
assert d_tokvecs[0].sum() == 0.
parser.update([doc], [gold], sgd=optimize)
def test_predict_doc_beam(parser, tok2vec, model, doc):
doc.tensor = tok2vec([doc])[0]
def test_predict_doc_beam(parser, model, doc):
parser.model = model
parser(doc, beam_width=32, beam_density=0.001)
for word in doc:
print(word.text, word.head, word.dep_)
def test_update_doc_beam(parser, tok2vec, model, doc, gold):
def test_update_doc_beam(parser, model, doc, gold):
parser.model = model
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
d_tokvecs = parser.update_beam(([doc], tokvecs), [gold])
assert d_tokvecs[0].shape == tokvecs[0].shape
def optimize(weights, gradient, key=None):
weights -= 0.001 * gradient
bp_tokvecs(d_tokvecs, sgd=optimize)
assert d_tokvecs[0].sum() == 0.
parser.update_beam([doc], [gold], sgd=optimize)

View File

@ -0,0 +1,8 @@
import pytest
@pytest.mark.models('en')
def test_issue1305(EN):
'''Test lemmatization of English VBZ'''
assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work'])
doc = EN(u'This app works well')
assert doc[2].lemma_ == 'work'

View File

@ -0,0 +1,14 @@
from __future__ import unicode_literals
import pytest
from ...language import Language
def test_issue1380_empty_string():
nlp = Language()
doc = nlp('')
assert len(doc) == 0
@pytest.mark.models('en')
def test_issue1380_en(EN):
doc = EN('')
assert len(doc) == 0

View File

@ -9,11 +9,14 @@ import pytest
@pytest.mark.models('en')
def test_issue429(EN):
def merge_phrases(matcher, doc, i, matches):
if i != len(matches) - 1:
return None
spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
for ent_id, label, span in spans:
span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label])
if i != len(matches) - 1:
return None
spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
for ent_id, label, span in spans:
span.merge(
tag=('NNP' if label else span.root.tag_),
lemma=span.text,
label='PERSON')
doc = EN('a')
matcher = Matcher(EN.vocab)

View File

@ -11,7 +11,7 @@ import pytest
def taggers(en_vocab):
tagger1 = Tagger(en_vocab)
tagger2 = Tagger(en_vocab)
tagger1.model = tagger1.Model(8, 8)
tagger1.model = tagger1.Model(8)
tagger2.model = tagger1.model
return (tagger1, tagger2)

View File

@ -6,6 +6,16 @@ from ...strings import StringStore
import pytest
def test_string_hash(stringstore):
'''Test that string hashing is stable across platforms'''
ss = stringstore
assert ss.add('apple') == 8566208034543834098
heart = '\U0001f499'
print(heart)
h = ss.add(heart)
assert h == 11841826740069053588
def test_stringstore_from_api_docs(stringstore):
apple_hash = stringstore.add('apple')
assert apple_hash == 8566208034543834098

View File

@ -34,7 +34,6 @@ def test_matcher_from_api_docs(en_vocab):
assert len(patterns[0])
@pytest.mark.xfail
def test_matcher_from_usage_docs(en_vocab):
text = "Wow 😀 This is really cool! 😂 😂"
doc = get_doc(en_vocab, words=text.split(' '))
@ -46,7 +45,8 @@ def test_matcher_from_usage_docs(en_vocab):
if doc.vocab.strings[match_id] == 'HAPPY':
doc.sentiment += 0.1
span = doc[start : end]
token = span.merge(norm='happy emoji')
token = span.merge()
token.vocab[token.text].norm_ = 'happy emoji'
matcher = Matcher(en_vocab)
matcher.add('HAPPY', label_sentiment, *pos_patterns)
@ -98,11 +98,11 @@ def test_matcher_match_multi(matcher):
(doc.vocab.strings['Java'], 5, 6)]
@pytest.mark.xfail
def test_matcher_phrase_matcher(en_vocab):
words = ["Google", "Now"]
doc = get_doc(en_vocab, words)
matcher = PhraseMatcher(en_vocab, [doc])
matcher = PhraseMatcher(en_vocab)
matcher.add('COMPANY', None, doc)
words = ["I", "like", "Google", "Now", "best"]
doc = get_doc(en_vocab, words)
assert len(matcher(doc)) == 1

View File

@ -9,7 +9,8 @@ from .util import get_doc
from pathlib import Path
import pytest
from thinc.neural import Maxout, Softmax
from thinc.neural._classes.maxout import Maxout
from thinc.neural._classes.softmax import Softmax
from thinc.api import chain

View File

@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
import sys
import pytest
@ -37,9 +38,10 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
tokens = tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
('i💙you', 3), ('🤘🤘yay!', 4)])
def test_tokenizer_handles_emoji(tokenizer, text, length):
tokens = tokenizer(text)
assert len(tokens) == length
# These break on narrow unicode builds, e.g. Windows
if sys.maxunicode >= 1114111:
tokens = tokenizer(text)
assert len(tokens) == length

View File

@ -54,7 +54,7 @@ cdef class Doc:
cdef public object noun_chunks_iterator
cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1
cpdef np.ndarray to_array(self, object features)

View File

@ -660,7 +660,7 @@ cdef class Doc:
"""
with path.open('rb') as file_:
bytes_data = file_.read()
self.from_bytes(bytes_data, **exclude)
return self.from_bytes(bytes_data, **exclude)
def to_bytes(self, **exclude):
"""Serialize, i.e. export the document contents to a binary string.

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals, print_function
import os
import ujson
import pip
import pkg_resources
import importlib
import regex as re
from pathlib import Path
@ -14,6 +14,7 @@ import numpy
import io
import dill
from collections import OrderedDict
from thinc.neural._classes.model import Model
import msgpack
import msgpack_numpy
@ -180,9 +181,10 @@ def is_package(name):
name (unicode): Name of package.
RETURNS (bool): True if installed package, False if not.
"""
packages = pip.get_installed_distributions()
name = name.lower() # compare package name against lowercase name
packages = pkg_resources.working_set.by_key.keys()
for package in packages:
if package.project_name.replace('-', '_') == name:
if package.lower().replace('-', '_') == name:
return True
return False
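A quick usage sketch for the helper above (not part of the commit; the output depends on what is installed locally):
from spacy import util

print(util.is_package('en_core_web_sm'))       # True only if that package is installed
print(util.is_package('definitely-not-here'))  # False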
@ -193,6 +195,7 @@ def get_package_path(name):
name (unicode): Package name.
RETURNS (Path): Path to installed package.
"""
name = name.lower() # use lowercase version to be safe
# Here we're importing the module just to find it. This is worryingly
# indirect, but it's otherwise very difficult to find the package.
pkg = importlib.import_module(name)
@ -557,3 +560,17 @@ def minify_html(html):
RETURNS (unicode): "Minified" HTML.
"""
return html.strip().replace(' ', '').replace('\n', '')
def use_gpu(gpu_id):
try:
import cupy.cuda.device
except ImportError:
return None
from thinc.neural.ops import CupyOps
device = cupy.cuda.device.Device(gpu_id)
device.use()
Model.ops = CupyOps()
Model.Ops = CupyOps
return device
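Minimal usage sketch for the new use_gpu() helper (not part of the commit); it returns None when CuPy is missing, so CPU-only setups keep working:
from spacy import util

device = util.use_gpu(0)
if device is None:
    print('CuPy not available, staying on CPU')
else:
    print('Thinc ops switched to CupyOps on GPU 0')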

View File

@ -6,6 +6,8 @@ import msgpack
import msgpack_numpy
msgpack_numpy.patch()
cimport numpy as np
from thinc.neural.util import get_array_module
from thinc.neural._classes.model import Model
from .typedefs cimport attr_t
from .strings cimport StringStore
@ -14,15 +16,29 @@ from .compat import basestring_
cdef class Vectors:
'''Store, save and load word vectors.'''
'''Store, save and load word vectors.
Vectors data is kept in the vectors.data attribute, which should be an
instance of numpy.ndarray (for CPU vectors)
or cupy.ndarray (for GPU vectors).
vectors.key2row is a dictionary mapping word hashes to rows
in the vectors.data table. The array `vectors.keys` keeps
the keys in order, such that keys[vectors.key2row[key]] == key.
'''
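To make that invariant concrete, a standalone sketch (not part of the commit) with plain numpy standing in for the real attributes:
import numpy

data = numpy.zeros((2, 300), dtype='f')             # vectors.data
keys = numpy.asarray([1234, 5678], dtype='uint64')  # vectors.keys, in insertion order
key2row = {1234: 0, 5678: 1}                        # vectors.key2row

for key, row in key2row.items():
    assert keys[row] == key                         # keys[key2row[key]] == key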
cdef public object data
cdef readonly StringStore strings
cdef public object key2row
cdef public object keys
cdef public int i
def __init__(self, strings, data_or_width):
self.strings = StringStore()
def __init__(self, strings, data_or_width=0):
if isinstance(strings, StringStore):
self.strings = strings
else:
self.strings = StringStore()
for string in strings:
self.strings.add(string)
if isinstance(data_or_width, int):
self.data = data = numpy.zeros((len(strings), data_or_width),
dtype='f')
@ -31,12 +47,17 @@ cdef class Vectors:
self.i = 0
self.data = data
self.key2row = {}
self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')
self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')
def __reduce__(self):
return (Vectors, (self.strings, self.data))
def __getitem__(self, key):
'''Get a vector by key. If key is a string, it is hashed
to an integer ID using the vectors.strings table.
If the integer key is not found in the table, a KeyError is raised.
'''
if isinstance(key, basestring):
key = self.strings[key]
i = self.key2row[key]
@ -46,23 +67,30 @@ cdef class Vectors:
return self.data[i]
def __setitem__(self, key, vector):
'''Set a vector for the given key. If key is a string, it is hashed
to an integer ID using the vectors.strings table.
'''
if isinstance(key, basestring):
key = self.strings.add(key)
i = self.key2row[key]
self.data[i] = vector
def __iter__(self):
'''Yield vectors from the table.'''
yield from self.data
def __len__(self):
'''Return the number of vectors that have been assigned.'''
return self.i
def __contains__(self, key):
'''Check whether a key has a vector entry in the table.'''
if isinstance(key, basestring_):
key = self.strings[key]
return key in self.key2row
def add(self, key, vector=None):
'''Add a key to the table, optionally setting a vector value as well.'''
if isinstance(key, basestring_):
key = self.strings.add(key)
if key not in self.key2row:
@ -80,7 +108,9 @@ cdef class Vectors:
return i
def items(self):
for i, string in enumerate(self.strings):
'''Iterate over (string key, vector) pairs, in order.'''
for i, key in enumerate(self.keys):
string = self.strings[key]
yield string, self.data[i]
@property
@ -118,9 +148,14 @@ cdef class Vectors:
self.data
def to_disk(self, path, **exclude):
xp = get_array_module(self.data)
if xp is numpy:
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
else:
save_array = lambda arr, file_: xp.save(file_, arr)
serializers = OrderedDict((
('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)),
('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)),
('vectors', lambda p: save_array(self.data, p.open('wb'))),
('keys', lambda p: xp.save(p.open('wb'), self.keys))
))
return util.to_disk(path, serializers, exclude)
@ -133,8 +168,9 @@ cdef class Vectors:
self.key2row[key] = i
def load_vectors(path):
xp = Model.ops.xp
if path.exists():
self.data = numpy.load(path)
self.data = xp.load(path)
serializers = OrderedDict((
('keys', load_keys),

View File

@ -27,6 +27,7 @@ from .vectors import Vectors
from . import util
from . import attrs
from . import symbols
from ._ml import link_vectors_to_models
cdef class Vocab:
@ -65,7 +66,7 @@ cdef class Vocab:
self.strings.add(name)
self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.vectors = Vectors(self.strings, 300)
self.vectors = Vectors(self.strings)
property lang:
def __get__(self):
@ -261,7 +262,7 @@ cdef class Vocab:
Words can be looked up by string or int ID.
RETURNS:
A word vector. Size and shape determed by the
A word vector. Size and shape determined by the
vocab.vectors instance. Usually, a numpy ndarray
of shape (300,) and dtype float32.
@ -323,6 +324,7 @@ cdef class Vocab:
self.lexemes_from_bytes(file_.read())
if self.vectors is not None:
self.vectors.from_disk(path, exclude='strings.json')
link_vectors_to_models(self)
return self
def to_bytes(self, **exclude):
@ -336,7 +338,7 @@ cdef class Vocab:
return None
else:
return self.vectors.to_bytes(exclude='strings.json')
getters = OrderedDict((
('strings', lambda: self.strings.to_bytes()),
('lexemes', lambda: self.lexemes_to_bytes()),
@ -436,6 +438,7 @@ def unpickle_vocab(sstore, morphology, data_dir,
vocab.lex_attr_getters = lex_attr_getters
vocab.lexemes_from_bytes(lexemes_data)
vocab.length = length
link_vectors_to_models(vocab)
return vocab

View File

@ -17,6 +17,7 @@ fi
if [ "${VIA}" == "compile" ]; then
pip install -r requirements.txt
python setup.py build_ext --inplace
pip install -e .
fi

View File

@ -8,4 +8,5 @@ include _includes/_mixins
| does not exist!
h2.c-landing__title.u-heading-3.u-padding-small
a(href="javascript:history.go(-1)") Click here to go back.
+button(false, true, "secondary-light")(href="javascript:history.go(-1)")
| Click here to go back

View File

@ -3,24 +3,22 @@
"landing": true,
"logos": [
{
"quora": [ "https://www.quora.com", 150 ],
"chartbeat": [ "https://chartbeat.com", 200 ],
"duedil": [ "https://www.duedil.com", 150 ],
"stitchfix": [ "https://www.stitchfix.com", 190 ]
"airbnb": [ "https://www.airbnb.com", 150, 45],
"quora": [ "https://www.quora.com", 120, 34 ],
"retriever": [ "https://www.retriever.no", 150, 33 ],
"stitchfix": [ "https://www.stitchfix.com", 150, 18 ]
},
{
"wayblazer": [ "http://wayblazer.com", 200 ],
"indico": [ "https://indico.io", 150 ],
"chattermill": [ "https://chattermill.io", 175 ],
"turi": [ "https://turi.com", 150 ],
"kip": [ "http://kipthis.com", 70 ]
},
"chartbeat": [ "https://chartbeat.com", 180, 25 ],
"allenai": [ "https://allenai.org", 220, 37 ]
}
],
"features": [
{
"socrata": [ "https://www.socrata.com", 150 ],
"cytora": [ "http://www.cytora.com", 125 ],
"signaln": [ "http://signaln.com", 150 ],
"wonderflow": [ "http://www.wonderflow.co", 200 ],
"synapsify": [ "http://www.gosynapsify.com", 150 ]
"thoughtworks": ["https://www.thoughtworks.com/radar/tools", 150, 28],
"wapo": ["https://www.washingtonpost.com/news/wonk/wp/2016/05/18/googles-new-artificial-intelligence-cant-understand-these-sentences-can-you/", 100, 77],
"venturebeat": ["https://venturebeat.com/2017/01/27/4-ai-startups-that-analyze-customer-reviews/", 150, 19],
"microsoft": ["https://www.microsoft.com/developerblog/2016/09/13/training-a-classifier-for-relation-extraction-from-medical-literature/", 130, 28]
}
]
},
@ -34,7 +32,24 @@
"landing": true
},
"announcement" : {
"title": "Important Announcement"
"styleguide": {
"title": "Styleguide",
"sidebar": {
"Styleguide": { "": "styleguide" },
"Resources": {
"Website Source": "https://github.com/explosion/spacy/tree/master/website",
"Contributing Guide": "https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md"
}
},
"menu": {
"Introduction": "intro",
"Logo": "logo",
"Colors": "colors",
"Typography": "typography",
"Elements": "elements",
"Components": "components",
"Embeds": "embeds",
"Markup Reference": "markup"
}
}
}

View File

@ -11,12 +11,9 @@
"COMPANY": "Explosion AI",
"COMPANY_URL": "https://explosion.ai",
"DEMOS_URL": "https://demos.explosion.ai",
"MODELS_REPO": "explosion/spacy-models",
"SPACY_VERSION": "1.8",
"LATEST_NEWS": {
"url": "https://github.com/explosion/spaCy/releases/tag/v2.0.0-alpha",
"title": "Test spaCy v2.0.0 alpha!"
},
"SPACY_VERSION": "2.0",
"SOCIAL": {
"twitter": "spacy_io",
@ -27,25 +24,23 @@
},
"NAVIGATION": {
"Home": "/",
"Usage": "/docs/usage",
"Reference": "/docs/api",
"Demos": "/docs/usage/showcase",
"Blog": "https://explosion.ai/blog"
"Usage": "/usage",
"Models": "/models",
"API": "/api"
},
"FOOTER": {
"spaCy": {
"Usage": "/docs/usage",
"API Reference": "/docs/api",
"Tutorials": "/docs/usage/tutorials",
"Showcase": "/docs/usage/showcase"
"Usage": "/usage",
"Models": "/models",
"API Reference": "/api",
"Resources": "/usage/resources"
},
"Support": {
"Issue Tracker": "https://github.com/explosion/spaCy/issues",
"StackOverflow": "http://stackoverflow.com/questions/tagged/spacy",
"Reddit usergroup": "https://www.reddit.com/r/spacynlp/",
"Gitter chat": "https://gitter.im/explosion/spaCy"
"Reddit Usergroup": "https://www.reddit.com/r/spacynlp/",
"Gitter Chat": "https://gitter.im/explosion/spaCy"
},
"Connect": {
"Twitter": "https://twitter.com/spacy_io",
@ -74,21 +69,11 @@
{"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" },
{"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVDIA graphics card with CUDA 2+. See section below for more info."}]
},
{ "id": "model", "title": "Models", "multiple": true, "options": [
{ "id": "en", "title": "English", "meta": "50MB" },
{ "id": "de", "title": "German", "meta": "645MB" },
{ "id": "fr", "title": "French", "meta": "1.33GB" },
{ "id": "es", "title": "Spanish", "meta": "377MB"}]
}
{ "id": "model", "title": "Models", "multiple": true }
],
"QUICKSTART_MODELS": [
{ "id": "lang", "title": "Language", "options": [
{ "id": "en", "title": "English", "checked": true },
{ "id": "de", "title": "German" },
{ "id": "fr", "title": "French" },
{ "id": "es", "title": "Spanish" }]
},
{ "id": "lang", "title": "Language"},
{ "id": "load", "title": "Loading style", "options": [
{ "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." },
{ "id": "module", "title": "Import as module", "help": "Import the model explicitly as a Python module." }]
@ -98,50 +83,15 @@
}
],
"MODELS": {
"en": [
{ "id": "en_core_web_sm", "lang": "English", "feats": [1, 1, 1, 1], "size": "50 MB", "license": "CC BY-SA", "def": true },
{ "id": "en_core_web_md", "lang": "English", "feats": [1, 1, 1, 1], "size": "1 GB", "license": "CC BY-SA" },
{ "id": "en_depent_web_md", "lang": "English", "feats": [1, 1, 1, 0], "size": "328 MB", "license": "CC BY-SA" },
{ "id": "en_vectors_glove_md", "lang": "English", "feats": [1, 0, 0, 1], "size": "727 MB", "license": "CC BY-SA" }
],
"de": [
{ "id": "de_core_news_md", "lang": "German", "feats": [1, 1, 1, 1], "size": "645 MB", "license": "CC BY-SA" }
],
"fr": [
{ "id": "fr_depvec_web_lg", "lang": "French", "feats": [1, 1, 0, 1], "size": "1.33 GB", "license": "CC BY-NC" }
],
"es": [
{ "id": "es_core_web_md", "lang": "Spanish", "feats": [1, 1, 1, 1], "size": "377 MB", "license": "CC BY-SA"}
]
},
"EXAMPLE_SENTENCES": {
"en": "This is a sentence.",
"de": "Dies ist ein Satz.",
"fr": "C'est une phrase.",
"es": "Esto es una frase."
},
"ALPHA": true,
"V_CSS": "1.6",
"V_JS": "1.2",
"V_CSS": "2.0",
"V_JS": "2.0",
"DEFAULT_SYNTAX": "python",
"ANALYTICS": "UA-58931649-1",
"MAILCHIMP": {
"user": "spacy.us12",
"id": "83b0498b1e7fa3c91ce68c3f1",
"list": "89ad33e698"
},
"BADGES": {
"pipy": {
"badge": "https://img.shields.io/pypi/v/spacy.svg?style=flat-square",
"link": "https://pypi.python.org/pypi/spacy"
},
"conda": {
"badge": "https://anaconda.org/conda-forge/spacy/badges/version.svg",
"link": "https://anaconda.org/conda-forge/spacy"
}
}
}
}

View File

@ -1,8 +1,6 @@
//- 💫 INCLUDES > FOOTER
include _mixins
footer.o-footer.u-text.u-border-dotted
footer.o-footer.u-text
+grid.o-content
each group, label in FOOTER
+grid-col("quarter")
@ -13,18 +11,18 @@ footer.o-footer.u-text.u-border-dotted
li
+a(url)=item
if SECTION != "docs"
if SECTION == "index"
+grid-col("quarter")
include _newsletter
if SECTION == "docs"
if SECTION != "index"
.o-content.o-block.u-border-dotted
include _newsletter
.o-inline-list.u-text-center.u-text-tiny.u-color-subtle
span &copy; 2016-#{new Date().getFullYear()} #[+a(COMPANY_URL, true)=COMPANY]
+a(COMPANY_URL, true)
+svg("graphics", "explosion", 45).o-icon.u-color-theme.u-grayscale
+a(COMPANY_URL, true)(aria-label="Explosion AI")
+icon("explosion", 45).o-icon.u-color-theme.u-grayscale
+a(COMPANY_URL + "/legal", true) Legal / Imprint

View File

@ -1,35 +1,71 @@
//- 💫 INCLUDES > FUNCTIONS
//- More descriptive variables for current.path and current.source
//- Descriptive variables, available in the global scope
- CURRENT = current.source
- SECTION = current.path[0]
- SUBSECTION = current.path[1]
- LANGUAGES = public.models._data.LANGUAGES
- MODELS = public.models._data.MODELS
- CURRENT_MODELS = MODELS[current.source] || []
- MODEL_COUNT = Object.keys(MODELS).map(m => Object.keys(MODELS[m]).length).reduce((a, b) => a + b)
- MODEL_LANG_COUNT = Object.keys(MODELS).length
- LANG_COUNT = Object.keys(LANGUAGES).length
- MODEL_META = public.models._data.MODEL_META
- MODEL_LICENSES = public.models._data.MODEL_LICENSES
- MODEL_ACCURACY = public.models._data.MODEL_ACCURACY
- EXAMPLE_SENTENCES = public.models._data.EXAMPLE_SENTENCES
- IS_PAGE = (SECTION != "index") && !landing
- IS_MODELS = (SECTION == "models" && LANGUAGES[current.source])
- HAS_MODELS = IS_MODELS && CURRENT_MODELS.length
//- Add prefixes to items of an array (for modifier CSS classes)
array - [array] list of class names or options, e.g. ["foot"]
prefix - [string] prefix to add to each class, e.g. "c-table__row"
RETURNS - [array] list of modified class names
- function prefixArgs(array, prefix) {
- return array.map(function(arg) {
- return prefix + '--' + arg;
- }).join(' ');
- return array.map(arg => prefix + '--' + arg).join(' ');
- }
//- Convert API paths (semi-temporary fix for renamed sections)
path - [string] link path supplied to +api mixin
RETURNS - [string] new link path to correct location
- function convertAPIPath(path) {
- if (path.startsWith('spacy#') || path.startsWith('displacy#') || path.startsWith('util#')) {
- var comps = path.split('#');
- return "top-level#" + comps[0] + '.' + comps[1];
- }
- else if (path.startsWith('cli#')) {
- return "top-level#" + path.split('#')[1];
- }
- return path;
- }
//- Get model components from ID. Components can then be looked up in LANGUAGES
and MODEL_META respectively, to get their human-readable form.
id - [string] model ID, e.g. "en_core_web_sm"
RETURNS - [object] object keyed by components lang, type, genre and size
- function getModelComponents(id) {
- var comps = id.split('_');
- return {'lang': comps[0], 'type': comps[1], 'genre': comps[2], 'size': comps[3]}
- }
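Illustration (not part of the commit) of the naming convention this helper parses, written as Python for brevity:
lang, model_type, genre, size = 'en_core_web_sm'.split('_')
assert (lang, model_type, genre, size) == ('en', 'core', 'web', 'sm')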
//- Generate GitHub links
repo - [string] name of repo owned by explosion
filepath - [string] logical path to file relative to repository root
branch - [string] optional branch, defaults to "master"
RETURNS - [string] the correct link to the file on GitHub
- function gh(repo, filepath, branch) {
- var branch = ALPHA ? 'develop' : branch
- return 'https://github.com/' + SOCIAL.github + '/' + repo + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' );
- }
//- Get social images
- function getSocialImg() {
- var base = SITE_URL + '/assets/img/social/preview_'
- var image = ALPHA ? 'alpha' : 'default'
- if (preview) image = preview
- else if (SECTION == 'docs' && !ALPHA) image = 'docs'
- return base + image + '.jpg'
- return 'https://github.com/' + SOCIAL.github + '/' + (repo || '') + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' );
- }

View File

@ -1,5 +1,13 @@
//- 💫 MIXINS > BASE
//- Section
id - [string] anchor assigned to section (used for breadcrumb navigation)
mixin section(id)
section.o-section(id="section-" + id data-section=id)
block
//- Aside wrapper
label - [string] aside label
@ -11,34 +19,26 @@ mixin aside-wrapper(label)
block
//- Date
input - [string] date in the format YYYY-MM-DD
mixin date(input)
- var date = new Date(input)
- var months = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December' ]
time(datetime=JSON.parse(JSON.stringify(date)))&attributes(attributes)=months[date.getMonth()] + ' ' + date.getDate() + ', ' + date.getFullYear()
//- SVG from map
file - [string] SVG file name in /assets/img/
//- SVG from map (uses embedded SVG sprite)
name - [string] SVG symbol id
width - [integer] width in px
height - [integer] height in px (default: same as width)
mixin svg(file, name, width, height)
mixin svg(name, width, height)
svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
use(xlink:href="/assets/img/#{file}.svg##{name}")
use(xlink:href="#svg_#{name}")
//- Icon
name - [string] icon name, should be SVG symbol ID
size - [integer] icon width and height (default: 20)
name - [string] icon name (will be used as symbol id: #svg_{name})
width - [integer] icon width (default: 20)
height - [integer] icon height (defaults to width)
mixin icon(name, size)
- var size = size || 20
+svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes)
mixin icon(name, width, height)
- var width = width || 20
- var height = height || width
+svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes)
//- Pro/Con/Neutral icon
@ -46,8 +46,8 @@ mixin icon(name, size)
size - [integer] icon size (optional)
mixin procon(icon, size)
- colors = { pro: "green", con: "red", neutral: "yellow" }
+icon(icon, size)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)
- colors = { pro: "green", con: "red", neutral: "subtle" }
+icon("circle", size || 16)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)
//- Headlines Helper Mixin
@ -80,8 +80,7 @@ mixin headline(level)
mixin permalink(id)
if id
a.u-permalink(id=id href="##{id}")
+icon("anchor").u-permalink__icon
a.u-permalink(href="##{id}")
block
else
@ -109,7 +108,7 @@ mixin quickstart(groups, headline, description, hide_results)
.c-quickstart__fields
for option in group.options
input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
label.c-quickstart__label(for="qs-#{option.id}")!=option.title
label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title
if option.meta
| #[span.c-quickstart__label__meta (#{option.meta})]
if option.help
@ -122,12 +121,10 @@ mixin quickstart(groups, headline, description, hide_results)
code.c-code-block__content.c-quickstart__code(data-qs-results="")
block
.c-quickstart__info.u-text-tiny.o-block.u-text-right
| Like this widget? Check out #[+a("https://github.com/ines/quickstart").u-link quickstart.js]!
//- Quickstart code item
data [object] - Rendering conditions (keyed by option group ID, value: option)
data - [object] Rendering conditions (keyed by option group ID, value: option)
style - [string] modifier ID for line style
mixin qs(data, style)
- args = {}
@ -148,6 +145,13 @@ mixin terminal(label)
+code.x-terminal__code
block
//- Chart.js
id - [string] chart ID, will be assigned as #chart_{id}
mixin chart(id)
figure.o-block&attributes(attributes)
canvas(id="chart_#{id}" width="800" height="400" style="max-width: 100%")
//- Gitter chat button and widget
button - [string] text shown on button
@ -156,26 +160,24 @@ mixin terminal(label)
mixin gitter(button, label)
aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
button.js-gitter-button.c-chat__button.u-text-small
+icon("chat").o-icon--inline
button.js-gitter-button.c-chat__button.u-text-tag
+icon("chat", 16).o-icon--inline
!=button
//- Badge
name - [string] "pipy" or "conda"
image - [string] path to badge image
url - [string] badge link
mixin badge(name)
- site = BADGES[name]
if site
+a(site.link).u-padding-small
img(src=site.badge alt="{name} version" height="20")
mixin badge(image, url)
+a(url).u-padding-small.u-hide-link&attributes(attributes)
img.o-badge(src=image alt=url height="20")
//- Logo
//- spaCy logo
mixin logo()
+svg("graphics", "spacy", 675, 215).o-logo&attributes(attributes)
+svg("spacy", 675, 215).o-logo&attributes(attributes)
//- Landing
@ -186,18 +188,56 @@ mixin landing-header()
.c-landing__content
block
mixin landing-banner(headline, label)
.c-landing__banner.u-padding.o-block.u-color-light
+grid.c-landing__banner__content.o-no-block
+grid-col("third")
h3.u-heading.u-heading-1
if label
div
span.u-text-label.u-text-label--light=label
!=headline
mixin landing-badge(url, graphic, alt, size)
+a(url)(aria-label=alt title=alt).c-landing__badge
+svg("graphics", graphic, size || 225)
+grid-col("two-thirds").c-landing__banner__text
block
mixin landing-logos(title, logos)
.o-content.u-text-center&attributes(attributes)
h3.u-heading.u-text-label.u-color-dark=title
each row, i in logos
- var is_last = i == logos.length - 1
+grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null)
each details, name in row
+a(details[0]).u-padding-medium
+icon(name, details[1], details[2])
if is_last
block
//- Under construction (temporary)
Marks sections that still need to be completed for the v2.0 release.
mixin under-construction()
+infobox("🚧 Under construction")
+infobox("Under construction", "🚧")
| This section is still being written and will be updated for the v2.0
| release. Is there anything that you think should definitely be mentioned or
| explained here? Any examples you'd like to see? #[strong Let us know]
| on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub!
//- Alpha infobox (temporary)
Added in the templates to notify users that they're visiting the alpha site.
mixin alpha-info()
+infobox("You are viewing the spaCy v2.0.0 alpha docs", "⚠️")
strong This page is part of the alpha documentation for spaCy v2.0.
| It does not reflect the state of the latest stable release.
| Because v2.0 is still under development, the implementation
| may differ from the intended state described here. See the
| #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
| for details on how to install and test the new version. To
| read the official docs for spaCy v1.x,
| #[+a("https://spacy.io/docs") go here].

View File

@ -8,11 +8,15 @@ include _mixins-base
level - [integer] headline level, corresponds to h1, h2, h3 etc.
id - [string] unique identifier, creates permalink (optional)
mixin h(level, id)
+headline(level).u-heading&attributes(attributes)
mixin h(level, id, source)
+headline(level).u-heading(id=id)&attributes(attributes)
+permalink(id)
block
if source
+button(gh("spacy", source), false, "secondary", "small").u-nowrap.u-float-right
span Source #[+icon("code", 14).o-icon--inline]
//- External links
url - [string] link href
@ -38,21 +42,23 @@ mixin src(url)
//- API link (with added tag and automatically generated path)
path - [string] path to API docs page relative to /docs/api/
path - [string] path to API docs page relative to /api/
mixin api(path)
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap
- path = convertAPIPath(path)
+a("/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap
block
| #[+icon("book", 18).o-icon--inline.u-color-theme]
| #[+icon("book", 16).o-icon--inline.u-color-theme]
//- Help icon with tooltip
tooltip - [string] Tooltip text
tooltip - [string] Tooltip text
icon_size - [integer] Optional size of help icon in px.
mixin help(tooltip)
mixin help(tooltip, icon_size)
span(data-tooltip=tooltip)&attributes(attributes)
+icon("help", 16).i-icon--inline
+icon("help", icon_size || 16).o-icon--inline
//- Aside for text
@ -68,24 +74,43 @@ mixin aside(label)
label - [string] aside title (optional or false for no label)
language - [string] language for syntax highlighting (default: "python")
supports basic relevant languages available for PrismJS
prompt - [string] prompt displayed before first line, e.g. "$"
mixin aside-code(label, language)
mixin aside-code(label, language, prompt)
+aside-wrapper(label)
+code(false, language).o-no-block
+code(false, language, prompt).o-no-block
block
//- Infobox
label - [string] infobox title (optional or false for no title)
emoji - [string] optional emoji displayed before the title, necessary as
argument to be able to wrap it for spacing
mixin infobox(label)
mixin infobox(label, emoji)
aside.o-box.o-block.u-text-small
if label
h3.u-text-label.u-color-theme=label
h3.u-heading.u-text-label.u-color-theme
if emoji
span.o-emoji=emoji
| #{label}
block
//- Logos displayed in the top corner of some infoboxes
logos - [array] List of icon ID, width, height and link.
mixin infobox-logos(...logos)
.o-box__logos.u-text-right.u-float-right
for logo in logos
if logo[3]
| #[+a(logo[3]).u-inline-block.u-hide-link.u-padding-small #[+icon(logo[0], logo[1], logo[2]).u-color-dark]]
else
| #[+icon(logo[0], logo[1], logo[2]).u-color-dark]
//- Link button
url - [string] link href
trusted - [boolean] if not set / false, rel="noopener nofollow" is added
@ -94,7 +119,7 @@ mixin infobox(label)
see assets/css/_components/_buttons.sass
mixin button(url, trusted, ...style)
- external = url.includes("http")
- external = url && url.includes("http")
a.c-button.u-text-label(href=url class=prefixArgs(style, "c-button") role="button" target=external ? "_blank" : null rel=external && !trusted ? "noopener nofollow" : null)&attributes(attributes)
block
@ -103,31 +128,33 @@ mixin button(url, trusted, ...style)
label - [string] aside title (optional or false for no label)
language - [string] language for syntax highlighting (default: "python")
supports basic relevant languages available for PrismJS
prompt - [string] prompt or icon to display next to code block, (mostly used for old/new)
prompt - [string] prompt displayed before first line, e.g. "$"
height - [integer] optional height to clip code block to
icon - [string] icon displayed next to code block (e.g. "accept" for new code)
wrap - [boolean] wrap text and disable horizontal scrolling
mixin code(label, language, prompt, height)
mixin code(label, language, prompt, height, icon, wrap)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
if label
h4.u-text-label.u-text-label--dark=label
- var icon = (prompt == 'accept' || prompt == 'reject')
- var icon = icon || (prompt == 'accept' || prompt == 'reject')
if icon
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
.c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
+icon(icon, 18)
code.c-code-block__content(data-prompt=icon ? null : prompt)
code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=icon ? null : prompt)
block
//- Code blocks to display old/new versions
mixin code-old()
+code(false, false, "reject").o-block-small
+code(false, false, false, false, "reject").o-block-small
block
mixin code-new()
+code(false, false, "accept").o-block-small
+code(false, false, false, false, "accept").o-block-small
block
@ -138,12 +165,33 @@ mixin code-new()
mixin codepen(slug, height, default_tab)
figure.o-block(style="min-height: #{height}px")&attributes(attributes)
.codepen(data-height=height data-theme-id="26467" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen)
.codepen(data-height=height data-theme-id="31335" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen)
+a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen
script(async src="https://assets.codepen.io/assets/embed/ei.js")
//- GitHub embed
repo - [string] repository owned by explosion organization
file - [string] logical path to file, relative to repository root
alt_file - [string] alternative file path used in footer and link button
height - [integer] height of code preview in px
mixin github(repo, file, alt_file, height)
- var branch = ALPHA ? "develop" : "master"
- var height = height || 250
figure.o-block
pre.c-code-block.o-block-small(class="lang-#{(language || DEFAULT_SYNTAX)}" style="height: #{height}px; min-height: #{height}px")
code.c-code-block__content(data-gh-embed="#{repo}/#{branch}/#{file}")
footer.o-grid.u-text
.o-block-small.u-flex-full #[+icon("github")] #[code=repo + '/' + (alt_file || file)]
div
+button(gh(repo, alt_file || file), false, "primary", "small") View on GitHub
//- Images / figures
url - [string] url or path to image
width - [integer] image width in px, for better rendering (default: 500)
@ -168,10 +216,26 @@ mixin image-caption()
block
//- Label
//- Graphic or illustration with button
original - [string] Path to original image
mixin graphic(original)
+image
block
if original
.u-text-right
+button(original, false, "secondary", "small") View large graphic
//- Labels
mixin label()
.u-text-label.u-color-subtle&attributes(attributes)
.u-text-label.u-color-dark&attributes(attributes)
block
mixin label-inline()
strong.u-text-label.u-color-dark&attributes(attributes)
block
@ -188,8 +252,10 @@ mixin tag()
mixin tag-model(...capabs)
- var intro = "To use this functionality, spaCy needs a model to be installed"
- var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : ""
+tag Requires model
+help(intro + ext + ".").u-color-theme
span.u-nowrap
+tag Needs model
+help(intro + ext + ".").u-color-theme
//- "New" tag to label features new in a specific version
@ -219,15 +285,9 @@ mixin list(type, start)
//- List item (only used within +list)
mixin item(procon)
if procon
li&attributes(attributes)
+procon(procon).c-list__icon
block
else
li.c-list__item&attributes(attributes)
block
mixin item()
li.c-list__item&attributes(attributes)
block
//- Table
@ -237,9 +297,9 @@ mixin table(head)
table.c-table.o-block&attributes(attributes)
if head
+row
+row("head")
each column in head
th.c-table__head-cell.u-text-label=column
+head-cell=column
block
@ -251,10 +311,11 @@ mixin row(...style)
block
//- Footer table row (only used within +table)
mixin footrow()
tr.c-table__row.c-table__row--foot&attributes(attributes)
//- Header table cell (only used within +row)
mixin head-cell()
th.c-table__head-cell.u-text-label&attributes(attributes)
block
@ -284,71 +345,58 @@ mixin grid-col(width)
//- Card (only used within +grid)
title - [string] card title
details - [object] url, image, author, description, tags etc.
(see /docs/usage/_data.json)
title - [string] card title
url - [string] link for card
author - [string] optional author, displayed as byline at the bottom
icon - [string] optional ID of icon displayed with card
width - [string] optional width of grid column, defaults to "half"
mixin card(title, details)
+grid-col("half").o-card.u-text&attributes(attributes)
if details.image
+a(details.url).o-block-small
img(src=details.image alt=title width="300" role="presentation")
if title
+a(details.url)
+h(3)=title
if details.author
.u-text-small.u-color-subtle by #{details.author}
if details.description || details.tags
ul
if details.description
li=details.description
if details.tags
li
each tag in details.tags
span.u-text-tag #{tag}
| &nbsp;
block
mixin card(title, url, author, icon, width)
+grid-col(width || "half").o-box.o-grid.o-grid--space.u-text&attributes(attributes)
+a(url)
h4.u-heading.u-text-label
if icon
+icon(icon, 25).u-float-right
if title
span.u-color-dark=title
.o-block-small.u-text-small
block
if author
.u-color-subtle.u-text-tiny by #{author}
//- Simpler card list item (only used within +list)
title - [string] card title
details - [object] url, image, author, description, tags etc.
(see /docs/usage/_data.json)
//- Table of contents, to be used with +item mixins for links
col - [string] width of column (see +grid-col)
mixin card-item(title, details)
+item&attributes(attributes)
+a(details.url)=title
if details.description
br
span=details.description
if details.author
br
span.u-text-small.u-color-subtle by #{details.author}
mixin table-of-contents(col)
+grid-col(col || "half")
+infobox
+label.o-block-small Table of contents
+list("numbers").u-text-small.o-no-block
block
//- Table row for models table
//- Bibliography
id - [string] ID of bibliography component, for anchor links. Can be used if
there's more than one bibliography on one page.
mixin model-row(name, lang, procon, size, license, default_model, divider)
- var licenses = { "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/", "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/" }
mixin bibliography(id)
section(id=id || "bibliography")
+infobox
+label.o-block-small Bibliography
+list("numbers").u-text-small.o-no-block
block
+row(divider ? "divider": null)
+cell #[code=name]
if default_model
| #[span.u-color-theme(title="default model") #[+icon("star", 16)]]
+cell=lang
each icon in procon
+cell.u-text-center #[+procon(icon ? "pro" : "con")]
+cell.u-text-right=size
+cell
if license in licenses
+a(licenses[license])=license
//- Footnote
id - [string / integer] ID of footnote.
bib_id - [string] ID of bibliography component, defaults to "bibliography".
tooltip - [string] optional text displayed as tooltip
mixin fn(id, bib_id, tooltip)
sup.u-padding-small(id="bib" + id data-tooltip=tooltip)
span.u-text-tag
+a("#" + (bib_id || "bibliography")).u-hide-link #{id}
//- Table rows for annotation specs
@ -383,14 +431,3 @@ mixin annotation-row(annots, style)
else
+cell=cell
block
//- Table of contents, to be used with +item mixins for links
col - [string] width of column (see +grid-col)
mixin table-of-contents(col)
+grid-col(col || "half")
+infobox
+label.o-block-small Table of contents
+list("numbers").u-text-small.o-no-block
block

View File

@ -1,19 +1,15 @@
//- 💫 INCLUDES > TOP NAVIGATION
include _mixins
nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null)
a(href='/') #[+logo]
if SUBSECTION != "index"
.u-text-label.u-padding-small.u-hidden-xs=SUBSECTION
a(href="/" aria-label=SITENAME) #[+logo]
ul.c-nav__menu
- var NAV = ALPHA ? { "Usage": "/docs/usage", "Reference": "/docs/api" } : NAVIGATION
each url, item in NAV
li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null)
- var current_url = '/' + current.path[0]
each url, item in NAVIGATION
li.c-nav__menu__item(class=(current_url == url) ? "is-active" : null)
+a(url)=item
li.c-nav__menu__item
+a(gh("spaCy"))(aria-label="GitHub").u-hidden-xs #[+icon("github", 20)]
li.c-nav__menu__item.u-hidden-xs
+a(gh("spaCy"))(aria-label="GitHub") #[+icon("github", 20)]
progress.c-progress.js-progress(value="0" max="1")

View File

@ -1,6 +1,6 @@
//- 💫 INCLUDES > NEWSLETTER
ul.o-block
ul.o-block-small
li.u-text-label.u-color-subtle Stay in the loop!
li Receive updates about new releases, tutorials and more.
@ -10,7 +10,6 @@ form.o-grid#mc-embedded-subscribe-form(action="//#{MAILCHIMP.user}.list-manage.c
div(style="position: absolute; left: -5000px;" aria-hidden="true")
input(type="text" name="b_#{MAILCHIMP.id}_#{MAILCHIMP.list}" tabindex="-1" value="")
.o-grid-col.u-border.u-padding-small
input#mce-EMAIL.u-text(type="email" name="EMAIL" placeholder="Your email")
button#mc-embedded-subscribe.u-text-label.u-color-theme(type="submit" name="subscribe") Sign up
.o-grid-col.o-grid.o-grid--nowrap.o-field.u-padding-small
input#mce-EMAIL.o-field__input.u-text(type="email" name="EMAIL" placeholder="Your email" aria-label="Your email")
button#mc-embedded-subscribe.o-field__button.u-text-label.u-color-theme.u-nowrap(type="submit" name="subscribe") Sign up

View File

@ -1,47 +1,56 @@
//- 💫 INCLUDES > DOCS PAGE TEMPLATE
- sidebar_content = (SUBSECTION != "index") ? public.docs[SUBSECTION]._data.sidebar : public.docs._data.sidebar || FOOTER
- sidebar_content = (public[SECTION] ? public[SECTION]._data.sidebar : public._data[SECTION] ? public._data[SECTION].sidebar : false) || FOOTER
include _sidebar
main.o-main.o-main--sidebar.o-main--aside
article.o-content
+grid.o-no-block
+grid-col(source ? "two-thirds" : "full")
+h(1)=title
if tag
+tag=tag
+h(1).u-heading--title=title.replace("'", "")
if tag
+tag=tag
if tag_new
+tag-new(tag_new)
if teaser
.u-heading__teaser.u-text-small.u-color-dark=teaser
else if IS_MODELS
.u-heading__teaser.u-text-small.u-color-dark
| Available statistical models for
| #[code=current.source] (#{LANGUAGES[current.source]}).
if source
+grid-col("third").u-text-right
.o-inline-list
+button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)]
.o-block.u-text-right
+button(gh("spacy", source), false, "secondary", "small").u-nowrap
| Source #[+icon("code", 14)]
//-if ALPHA
//- +alpha-info
if ALPHA
+infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs")
strong This page is part of the alpha documentation for spaCy v2.0.
| It does not reflect the state of the latest stable release.
| Because v2.0 is still under development, the implementation
| may differ from the intended state described here. See the
| #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
| for details on how to install and test the new version. To
| read the official docs for spaCy v1.x,
| #[+a("https://spacy.io/docs") go here].
!=yield
if IS_MODELS
include _page_models
else
!=yield
+grid.o-content.u-text
+grid-col("half")
if next && public.docs[SUBSECTION]._data[next]
- data = public.docs[SUBSECTION]._data[next]
if !IS_MODELS
.o-inline-list
span #[strong.u-text-label Read next:] #[+a(next).u-link=data.title]
+button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary", "small")
| #[span.o-icon Suggest edits] #[+icon("code", 14)]
+grid-col("half").u-text-right
.o-inline-list
+button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)]
if next && public[SECTION]._data[next]
- data = public[SECTION]._data[next]
+grid("vcenter")
+a(next).u-text-small.u-flex-full
h4.u-text-label.u-color-dark Read next
| #{data.title}
+a(next).c-icon-button.c-icon-button--right(aria-hidden="true")
+icon("arrow-right", 24)
+gitter("spaCy chat")

View File

@ -0,0 +1,77 @@
//- 💫 INCLUDES > MODELS PAGE TEMPLATE
for id in CURRENT_MODELS
+section(id)
+grid("vcenter").o-no-block(id=id)
+grid-col("two-thirds")
+h(2)
+a("#" + id).u-permalink=id
+grid-col("third").u-text-right
.u-color-subtle.u-text-tiny
+button(gh("spacy-models") + "/releases", true, "secondary", "small")(data-tpl=id data-tpl-key="download")
| Release details
.u-padding-small Latest: #[code(data-tpl=id data-tpl-key="version") n/a]
+aside-code("Installation", "bash", "$").
spacy download #{id}
- var comps = getModelComponents(id)
p(data-tpl=id data-tpl-key="description")
div(data-tpl=id data-tpl-key="error" style="display: none")
+infobox
| Unable to load model details from GitHub. To find out more
| about this model, see the overview of the
| #[+a(gh("spacy-models") + "/releases") latest model releases].
+table(data-tpl=id data-tpl-key="table")
+row
+cell #[+label Language]
+cell #[+tag=comps.lang] #{LANGUAGES[comps.lang]}
for comp, label in {"Type": comps.type, "Genre": comps.genre}
+row
+cell #[+label=label]
+cell #[+tag=comp] #{MODEL_META[comp]}
+row
+cell #[+label Size]
+cell #[+tag=comps.size] #[span(data-tpl=id data-tpl-key="size") #[em n/a]]
each label in ["Pipeline", "Sources", "Author", "License"]
- var field = label.toLowerCase()
+row
+cell.u-nowrap
+label=label
if MODEL_META[field]
| #[+help(MODEL_META[field]).u-color-subtle]
+cell
span(data-tpl=id data-tpl-key=field) #[em n/a]
+row(data-tpl=id data-tpl-key="compat-wrapper" style="display: none")
+cell
+label Compat #[+help("Latest compatible model version for your spaCy installation").u-color-subtle]
+cell
.o-field.u-float-left
select.o-field__select.u-text-small(data-tpl=id data-tpl-key="compat")
.o-empty(data-tpl=id data-tpl-key="compat-versions") &nbsp;
section(data-tpl=id data-tpl-key="accuracy-wrapper" style="display: none")
+grid.o-no-block
+grid-col("third")
+h(4) Accuracy
+table.o-block-small
for label, field in MODEL_ACCURACY
+row(style="display: none")
+cell.u-nowrap
+label=label
if MODEL_META[field]
| #[+help(MODEL_META[field]).u-color-subtle]
+cell.u-text-right(data-tpl=id data-tpl-key=field)
| n/a
+grid-col("two-thirds")
+h(4) Comparison
+chart(id).u-padding-small
p.u-text-small.u-color-dark(data-tpl=id data-tpl-key="notes")

View File

@ -1,27 +1,46 @@
//- 💫 INCLUDES > SCRIPTS
script(src="/assets/js/main.js?v#{V_JS}")
script(src="/assets/js/prism.js")
if quickstart
script(src="/assets/js/quickstart.min.js")
if SECTION == "docs"
if quickstart
script(src="/assets/js/quickstart.js")
script var qs = new Quickstart("#qs")
if IS_PAGE
script(src="/assets/js/in-view.min.js")
script.
((window.gitter = {}).chat = {}).options = {
useStyles: false,
activationElement: '.js-gitter-button',
targetElement: '.js-gitter',
room: '!{SOCIAL.gitter}'
};
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
if HAS_MODELS
script(src="/assets/js/chart.min.js")
if environment == "deploy"
script
script(async src="https://www.google-analytics.com/analytics.js")
script(src="/assets/js/prism.min.js")
script(src="/assets/js/main.js?v#{V_JS}")
script
| new ProgressBar('.js-progress');
if changelog
| new Changelog('!{SOCIAL.github}', 'spacy');
if quickstart
| new Quickstart("#qs");
if IS_PAGE
| new SectionHighlighter('data-section', 'data-nav');
| new GitHubEmbed('!{SOCIAL.github}', 'data-gh-embed');
| ((window.gitter = {}).chat = {}).options = {
| useStyles: false,
| activationElement: '.js-gitter-button',
| targetElement: '.js-gitter',
| room: '!{SOCIAL.gitter}'
| };
if HAS_MODELS
| new ModelLoader('!{MODELS_REPO}', !{JSON.stringify(CURRENT_MODELS)}, !{JSON.stringify(MODEL_LICENSES)}, !{JSON.stringify(MODEL_ACCURACY)});
if environment == "deploy"
| window.ga=window.ga||function(){
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
script(async src="https://www.google-analytics.com/analytics.js")
if IS_PAGE
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)

View File

@ -1,13 +1,23 @@
//- 💫 INCLUDES > SIDEBAR
include _mixins
menu.c-sidebar.js-sidebar.u-text
if sidebar_content
each items, menu in sidebar_content
ul.c-sidebar__section.o-block
li.u-text-label.u-color-subtle=menu
each items, sectiontitle in sidebar_content
ul.c-sidebar__section.o-block-small
li.u-text-label.u-color-dark=sectiontitle
each url, item in items
li(class=(CURRENT == url || (CURRENT == "index" && url == "./")) ? "is-active" : null)
+a(url)=item
- var is_current = CURRENT == url || (CURRENT == "index" && url == "./")
li.c-sidebar__item
+a(url)(class=is_current ? "is-active" : null)=item
if is_current
if IS_MODELS && CURRENT_MODELS.length
- menu = Object.assign({}, ...CURRENT_MODELS.map(id => ({ [id]: id })))
if menu
ul.c-sidebar__crumb.u-hidden-sm
- var counter = 0
for id, title in menu
- counter++
li.c-sidebar__crumb__item(data-nav=id class=(counter == 1) ? "is-active" : null)
+a("#section-" + id)=title

157 website/_includes/_svg.jade Normal file

File diff suppressed because one or more lines are too long

View File

@ -2,11 +2,16 @@
include _includes/_mixins
- title = IS_MODELS ? LANGUAGES[current.source] || title : title
- social_title = (SECTION == "index") ? SITENAME + " - " + SLOGAN : title + " - " + SITENAME
- social_img = SITE_URL + "/assets/img/social/preview_" + (preview || ALPHA ? "alpha" : "default") + ".jpg"
doctype html
html(lang="en")
title
if SECTION == "docs" && SUBSECTION && SUBSECTION != "index"
| #{title} | #{SITENAME} #{SUBSECTION == "api" ? "API" : "Usage"} Documentation
if SECTION == "api" || SECTION == "usage" || SECTION == "models"
- var title_section = (SECTION == "api") ? "API" : SECTION.charAt(0).toUpperCase() + SECTION.slice(1)
| #{title} | #{SITENAME} #{title_section} Documentation
else if SECTION != "index"
| #{title} | #{SITENAME}
@ -22,32 +27,30 @@ html(lang="en")
meta(property="og:type" content="website")
meta(property="og:site_name" content=sitename)
meta(property="og:url" content="#{SITE_URL}/#{current.path.join('/')}")
meta(property="og:title" content="#{title} - spaCy")
meta(property="og:title" content=social_title)
meta(property="og:description" content=description)
meta(property="og:image" content=getSocialImg())
meta(property="og:image" content=social_img)
meta(name="twitter:card" content="summary_large_image")
meta(name="twitter:site" content="@" + SOCIAL.twitter)
meta(name="twitter:title" content="#{title} - spaCy")
meta(name="twitter:title" content=social_title)
meta(name="twitter:description" content=description)
meta(name="twitter:image" content=getSocialImg())
meta(name="twitter:image" content=social_img)
link(rel="shortcut icon" href="/assets/img/favicon.ico")
link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico")
if ALPHA && SECTION == "docs"
if SECTION == "api"
link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet")
else if SUBSECTION == "usage"
link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet")
else
link(href="/assets/css/style.css?v#{V_CSS}" rel="stylesheet")
body
include _includes/_svg
include _includes/_navigation
if SECTION == "docs"
if !landing
include _includes/_page-docs
else

View File

@ -0,0 +1,43 @@
//- 💫 DOCS > API > ANNOTATION > BILUO
+table([ "Tag", "Description" ])
+row
+cell #[code #[span.u-color-theme B] EGIN]
+cell The first token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme I] N]
+cell An inner token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme L] AST]
+cell The final token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme U] NIT]
+cell A single-token entity.
+row
+cell #[code #[span.u-color-theme O] UT]
+cell A non-entity token.
+aside("Why BILUO, not IOB?")
| There are several coding schemes for encoding entity annotations as
| token tags. These coding schemes are equally expressive, but not
| necessarily equally learnable.
| #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth]
| showed that the minimal #[strong Begin], #[strong In], #[strong Out]
| scheme was more difficult to learn than the #[strong BILUO] scheme that
| we use, which explicitly marks boundary tokens.
p
| spaCy translates the character offsets into this scheme, in order to
| decide the cost of each action given the current state of the entity
| recogniser. The costs are then used to calculate the gradient of the
| loss, to train the model. The exact algorithm is a pastiche of
| well-known methods, and is not currently described in any single
| publication. The model is a greedy transition-based parser guided by a
| linear model whose weights are learned using the averaged perceptron
| loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle]
| imitation learning strategy. The transition system is equivalent to the
| BILUO tagging scheme.
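p
| To make the scheme concrete, here is a minimal sketch in plain Python
| (an illustration only, not spaCy's internal code) that assigns BILUO
| tags from token indices rather than character offsets. The token counts,
| entity boundaries and the #[code GPE] label are made-up example values.
+code("BILUO tagging sketch").
    def biluo_tags(n_tokens, ent_start, ent_end, label):
        """Tag tokens ent_start..ent_end-1 as one entity, the rest as outside."""
        tags = ['O'] * n_tokens
        if ent_end - ent_start == 1:
            tags[ent_start] = 'U-' + label           # single-token entity
        else:
            tags[ent_start] = 'B-' + label           # first token
            for i in range(ent_start + 1, ent_end - 1):
                tags[i] = 'I-' + label               # inner tokens
            tags[ent_end - 1] = 'L-' + label         # last token
        return tags

    # "San Francisco Bay" as one three-token GPE entity
    assert biluo_tags(3, 0, 3, 'GPE') == ['B-GPE', 'I-GPE', 'L-GPE']
    # "London" as a single-token GPE entity in a four-token sentence
    assert biluo_tags(4, 0, 1, 'GPE') == ['U-GPE', 'O', 'O', 'O']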

View File

@ -0,0 +1,115 @@
//- 💫 DOCS > API > ARCHITECTURE > CYTHON
+aside("What's Cython?")
| #[+a("http://cython.org/") Cython] is a language for writing
| C extensions for Python. Most Python code is also valid Cython, but
| you can add type declarations to get efficient memory-managed code
| just like C or C++.
p
| spaCy's core data structures are implemented as
| #[+a("http://cython.org/") Cython] #[code cdef] classes. Memory is
| managed through the #[+a(gh("cymem")) #[code cymem]]
| #[code cymem.Pool] class, which allows you
| to allocate memory which will be freed when the #[code Pool] object
| is garbage collected. This means you usually don't have to worry
| about freeing memory. You just have to decide which Python object
| owns the memory, and make it own the #[code Pool]. When that object
| goes out of scope, the memory will be freed. You do have to take
| care that no pointers outlive the object that owns them — but this
| is generally quite easy.
p
| All Cython modules should have the #[code # cython: infer_types=True]
| compiler directive at the top of the file. This makes the code much
| cleaner, as it avoids the need for many type declarations. If
| possible, you should prefer to declare your functions #[code nogil],
| even if you don't especially care about multi-threading. The reason
| is that #[code nogil] functions help the Cython compiler reason about
| your code quite a lot — you're telling the compiler that no Python
| dynamics are possible. This lets many errors be raised, and ensures
| your function will run at C speed.
p
| Cython gives you many choices of sequences: you could have a Python
| list, a numpy array, a memory view, a C++ vector, or a pointer.
| Pointers are preferred, because they are fastest, have the most
| explicit semantics, and let the compiler check your code more
| strictly. C++ vectors are also great — but you should only use them
| internally in functions. It's less friendly to accept a vector as an
| argument, because that asks the user to do much more work. Here's
| how to get a pointer from a numpy array, memory view or vector:
+code.
cdef void get_pointers(np.ndarray[int, mode='c'] numpy_array, vector[int] cpp_vector, int[::1] memory_view) nogil:
    pointer1 = &lt;int*&gt;numpy_array.data
    pointer2 = cpp_vector.data()
    pointer3 = &memory_view[0]
p
| Both C arrays and C++ vectors reassure the compiler that no Python
| operations are possible on your variable. This is a big advantage:
| it lets the Cython compiler raise many more errors for you.
p
| When getting a pointer from a numpy array or memoryview, take care
| that the data is actually stored in C-contiguous order — otherwise
| you'll get a pointer to nonsense. The type-declarations in the code
| above should generate runtime errors if buffers with incorrect
| memory layouts are passed in. To iterate over the array, the
| following style is preferred:
+code.
cdef int c_total(const int* int_array, int length) nogil:
    total = 0
    for item in int_array[:length]:
        total += item
    return total
p
| If this is confusing, consider that the compiler couldn't deal with
| #[code for item in int_array:] — there's no length attached to a raw
| pointer, so how could we figure out where to stop? The length is
| provided in the slice notation as a solution to this. Note that we
| don't have to declare the type of #[code item] in the code above —
| the compiler can easily infer it. This gives us tidy code that looks
| quite like Python, but is exactly as fast as C — because we've made
| sure the compilation to C is trivial.
p
| Your functions cannot be declared #[code nogil] if they need to
| create Python objects or call Python functions. This is perfectly
| okay — you shouldn't torture your code just to get #[code nogil]
| functions. However, if your function isn't #[code nogil], you should
| compile your module with #[code cython -a --cplus my_module.pyx] and
| open the resulting #[code my_module.html] file in a browser. This
| will let you see how Cython is compiling your code. Calls into the
| Python run-time will be in bright yellow. This lets you easily see
| whether Cython is able to correctly type your code, or whether there
| are unexpected problems.
p
| Working in Cython is very rewarding once you're over the initial
| learning curve. As with C and C++, the first way you write something
| in Cython will often be the performance-optimal approach. In
| contrast, Python optimisation generally requires a lot of
| experimentation. Is it faster to have an #[code if item in my_dict]
| check, or to use #[code .get()]? What about
| #[code try]/#[code except]? Does this numpy operation create a copy?
| There's no way to guess the answers to these questions, and you'll
| usually be dissatisfied with your results — so there's no way to
| know when to stop this process. In the worst case, you'll make a
| mess that invites the next reader to try their luck too. This is
| like one of those
| #[+a("http://www.wemjournal.org/article/S1080-6032%2809%2970088-2/abstract") volcanic gas-traps],
| where the rescuers keep passing out from low oxygen, causing
| another rescuer to follow — only to succumb themselves. In short,
| just say no to optimizing your Python. If it's not fast enough the
| first time, just switch to Cython.
+infobox("Resources")
+list.o-no-block
+item #[+a("http://docs.cython.org/en/latest/") Official Cython documentation] (cython.org)
+item #[+a("https://explosion.ai/blog/writing-c-in-cython", true) Writing C in Cython] (explosion.ai)
+item #[+a("https://explosion.ai/blog/multithreading-with-cython") Multi-threading spaCys parser and named entity recogniser] (explosion.ai)

View File

@ -0,0 +1,141 @@
//- 💫 DOCS > API > ARCHITECTURE > NN MODEL ARCHITECTURE
p
| The parsing model is a blend of recent results. The two main
| inspirations have been the work of Eliyahu Kiperwasser and Yoav Goldberg
| at Bar-Ilan University#[+fn(1)], and the SyntaxNet team from Google. The foundation of
| the parser is still based on the work of Joakim Nivre#[+fn(2)], who
| introduced the transition-based framework#[+fn(3)], the arc-eager
| transition system, and the imitation learning objective. The model is
| implemented using #[+a(gh("thinc")) Thinc], spaCy's machine learning
| library. We first predict context-sensitive vectors for each word in the
| input:
+code.
(embed_lower | embed_prefix | embed_suffix | embed_shape)
&gt;&gt; Maxout(token_width)
&gt;&gt; convolution ** 4
p
| This convolutional layer is shared between the tagger, parser and NER,
| and will also be shared by the future neural lemmatizer. Because the
| parser shares these layers with the tagger, the parser does not require
| tag features. I got this trick from David Weiss's "Stack-propagation"
| paper#[+fn(4)].
p
| To boost the representation, the tagger actually predicts a "super tag"
| with POS, morphology and dependency label#[+fn(5)]. The tagger predicts
| these supertags by adding a softmax layer onto the convolutional layer,
| so we're teaching the convolutional layer to give us a representation
| that's one affine transform away from this informative lexical information.
| This is obviously good for the parser (which backprops to the
| convolutions too). The parser model makes a state vector by concatenating
| the vector representations for its context tokens. The current context
| tokens:
+table
+row
+cell #[code S0], #[code S1], #[code S2]
+cell Top three words on the stack.
+row
+cell #[code B0], #[code B1]
+cell First two words of the buffer.
+row
+cell.u-nowrap
| #[code S0L1], #[code S1L1], #[code S2L1], #[code B0L1],
| #[code B1L1]#[br]
| #[code S0L2], #[code S1L2], #[code S2L2], #[code B0L2],
| #[code B1L2]
+cell
| Leftmost and second leftmost children of #[code S0], #[code S1],
| #[code S2], #[code B0] and #[code B1].
+row
+cell.u-nowrap
| #[code S0R1], #[code S1R1], #[code S2R1], #[code B0R1],
| #[code B1R1]#[br]
| #[code S0R2], #[code S1R2], #[code S2R2], #[code B0R2],
| #[code B1R2]
+cell
| Rightmost and second rightmost children of #[code S0], #[code S1],
| #[code S2], #[code B0] and #[code B1].
p
| This makes the state vector quite long: #[code 13*T], where #[code T] is
| the token vector width (128 is working well). Fortunately, there's a way
| to structure the computation to save some expense (and make it more
| GPU-friendly).
p
| The parser typically visits #[code 2*N] states for a sentence of length
| #[code N] (although it may visit more, if it back-tracks with a
| non-monotonic transition#[+fn(6)]). A naive implementation would require
| #[code 2*N (B, 13*T) @ (13*T, H)] matrix multiplications for a batch of
| size #[code B]. We can instead perform one #[code (B*N, T) @ (T, 13*H)]
| multiplication, to pre-compute the hidden weights for each positional
| feature with respect to the words in the batch. (Note that our token
| vectors come from the CNN — so we can't play this trick over the
| vocabulary. That's how Stanford's NN parser#[+fn(7)] works — and why its
| model is so big.)
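p
| The following is a small numpy sketch of that trick, with made-up sizes
| and random values. It is not spaCy's implementation (the bias and the
| maxout non-linearity are omitted), but it shows why gathering and summing
| rows of a pre-computed table gives the same result as multiplying each
| state's concatenated #[code 13*T] vector by a #[code (13*T, H)] matrix.
+code("Pre-computation sketch").
    import numpy as np

    T, H, n_feats = 128, 64, 13             # token width, hidden width, features
    n_tokens = 50                           # tokens in the batch (B*N in the text)

    tokens = np.random.randn(n_tokens, T)   # CNN output, one row per token
    W = np.random.randn(n_feats, T, H)      # lower-layer weights, per feature slot

    def naive(feats):
        # Concatenate the 13 feature tokens, multiply by a (13*T, H) matrix.
        state_vector = tokens[feats].reshape(n_feats * T)
        return state_vector.dot(W.reshape(n_feats * T, H))

    # Pre-compute one (n_tokens, T) @ (T, 13*H) product for the whole batch...
    precomputed = tokens.dot(W.transpose(1, 0, 2).reshape(T, n_feats * H))
    precomputed = precomputed.reshape(n_tokens, n_feats, H)

    def fast(feats):
        # ...so that each state only gathers and sums 13 rows of width H.
        return precomputed[feats, np.arange(n_feats)].sum(axis=0)

    feats = np.random.randint(0, n_tokens, size=n_feats)  # indices of S0, S1, ...
    assert np.allclose(naive(feats), fast(feats))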
p
| This pre-computation strategy allows a nice compromise between
| GPU-friendliness and implementation simplicity. The CNN and the wide
| lower layer are computed on the GPU, and then the precomputed hidden
| weights are moved to the CPU, before we start the transition-based
| parsing process. This makes a lot of things much easier. We don't have to
| worry about variable-length batch sizes, and we don't have to implement
| the dynamic oracle in CUDA to train.
p
| Currently the parser's loss function is multilabel log loss#[+fn(6)], as
| the dynamic oracle allows multiple states to be 0 cost. This is defined
| as follows, where #[code gZ] is the sum of the exponentiated scores
| assigned to the gold (zero-cost) classes:
+code.
(exp(score) / Z) - (exp(score) / gZ)
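p
| As a small illustration of that expression, here is a numpy sketch for a
| single state with invented scores. #[code Z] is the partition over all
| classes and #[code gZ] the partition over the zero-cost (gold) classes;
| non-gold classes only receive the first term.
+code("Loss gradient sketch").
    import numpy as np

    scores = np.array([1.5, 0.2, -0.3, 2.1])        # made-up scores for one state
    is_gold = np.array([True, False, False, True])  # zero-cost actions

    exp_scores = np.exp(scores)
    Z = exp_scores.sum()                            # partition over all classes
    gZ = exp_scores[is_gold].sum()                  # partition over gold classes

    # Gradient of the loss with respect to each score: push probability mass
    # away from non-gold classes, towards the renormalised gold distribution.
    d_scores = exp_scores / Z - np.where(is_gold, exp_scores / gZ, 0.0)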
+bibliography
+item
| #[+a("https://www.semanticscholar.org/paper/Simple-and-Accurate-Dependency-Parsing-Using-Bidir-Kiperwasser-Goldberg/3cf31ecb2724b5088783d7c96a5fc0d5604cbf41") Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations]
br
| Eliyahu Kiperwasser, Yoav Goldberg. (2016)
+item
| #[+a("https://www.semanticscholar.org/paper/A-Dynamic-Oracle-for-Arc-Eager-Dependency-Parsing-Goldberg-Nivre/22697256ec19ecc3e14fcfc63624a44cf9c22df4") A Dynamic Oracle for Arc-Eager Dependency Parsing]
br
| Yoav Goldberg, Joakim Nivre (2012)
+item
| #[+a("https://explosion.ai/blog/parsing-english-in-python") Parsing English in 500 Lines of Python]
br
| Matthew Honnibal (2013)
+item
| #[+a("https://www.semanticscholar.org/paper/Stack-propagation-Improved-Representation-Learning-Zhang-Weiss/0c133f79b23e8c680891d2e49a66f0e3d37f1466") Stack-propagation: Improved Representation Learning for Syntax]
br
| Yuan Zhang, David Weiss (2016)
+item
| #[+a("https://www.semanticscholar.org/paper/Deep-multi-task-learning-with-low-level-tasks-supe-S%C3%B8gaard-Goldberg/03ad06583c9721855ccd82c3d969a01360218d86") Deep multi-task learning with low level tasks supervised at lower layers]
br
| Anders Søgaard, Yoav Goldberg (2016)
+item
| #[+a("https://www.semanticscholar.org/paper/An-Improved-Non-monotonic-Transition-System-for-De-Honnibal-Johnson/4094cee47ade13b77b5ab4d2e6cb9dd2b8a2917c") An Improved Non-monotonic Transition System for Dependency Parsing]
br
| Matthew Honnibal, Mark Johnson (2015)
+item
| #[+a("http://cs.stanford.edu/people/danqi/papers/emnlp2014.pdf") A Fast and Accurate Dependency Parser using Neural Networks]
br
| Danqi Chen, Christopher D. Manning (2014)
+item
| #[+a("https://www.semanticscholar.org/paper/Parsing-the-Wall-Street-Journal-using-a-Lexical-Fu-Riezler-King/0ad07862a91cd59b7eb5de38267e47725a62b8b2") Parsing the Wall Street Journal using a Lexical-Functional Grammar and Discriminative Estimation Techniques]
br
| Stefan Riezler et al. (2002)

View File

@ -1,29 +1,32 @@
{
"sidebar": {
"Introduction": {
"Facts & Figures": "./",
"Languages": "language-models",
"Annotation Specs": "annotation"
"Overview": {
"Architecture": "./",
"Annotation Specs": "annotation",
"Functions": "top-level"
},
"Top-level": {
"spacy": "spacy",
"displacy": "displacy",
"Utility Functions": "util",
"Command line": "cli"
},
"Classes": {
"Containers": {
"Doc": "doc",
"Token": "token",
"Span": "span",
"Lexeme": "lexeme"
},
"Pipeline": {
"Language": "language",
"Tokenizer": "tokenizer",
"Pipe": "pipe",
"Tensorizer": "tensorizer",
"Tagger": "tagger",
"DependencyParser": "dependencyparser",
"EntityRecognizer": "entityrecognizer",
"TextCategorizer": "textcategorizer",
"Tokenizer": "tokenizer",
"Lemmatizer": "lemmatizer",
"Matcher": "matcher",
"Lexeme": "lexeme",
"PhraseMatcher": "phrasematcher"
},
"Other": {
"Vocab": "vocab",
"StringStore": "stringstore",
"Vectors": "vectors",
@ -34,52 +37,37 @@
},
"index": {
"title": "Facts & Figures",
"next": "language-models"
"title": "Architecture",
"next": "annotation",
"menu": {
"Basics": "basics",
"Neural Network Model": "nn-model",
"Cython Conventions": "cython"
}
},
"language-models": {
"title": "Languages",
"next": "philosophy"
},
"philosophy": {
"title": "Philosophy"
},
"spacy": {
"title": "spaCy top-level functions",
"source": "spacy/__init__.py",
"next": "displacy"
},
"displacy": {
"title": "displaCy",
"tag": "module",
"source": "spacy/displacy",
"next": "util"
},
"util": {
"title": "Utility Functions",
"source": "spacy/util.py",
"next": "cli"
},
"cli": {
"title": "Command Line Interface",
"source": "spacy/cli"
"top-level": {
"title": "Top-level Functions",
"menu": {
"spacy": "spacy",
"displacy": "displacy",
"Utility Functions": "util",
"Compatibility": "compat",
"Command Line": "cli"
}
},
"language": {
"title": "Language",
"tag": "class",
"teaser": "A text-processing pipeline.",
"source": "spacy/language.py"
},
"doc": {
"title": "Doc",
"tag": "class",
"teaser": "A container for accessing linguistic annotations.",
"source": "spacy/tokens/doc.pyx"
},
@ -103,6 +91,7 @@
"vocab": {
"title": "Vocab",
"teaser": "A storage class for vocabulary and other data shared across a language.",
"tag": "class",
"source": "spacy/vocab.pyx"
},
@ -115,10 +104,27 @@
"matcher": {
"title": "Matcher",
"teaser": "Match sequences of tokens, based on pattern rules.",
"tag": "class",
"source": "spacy/matcher.pyx"
},
"phrasematcher": {
"title": "PhraseMatcher",
"teaser": "Match sequences of tokens, based on documents.",
"tag": "class",
"tag_new": 2,
"source": "spacy/matcher.pyx"
},
"pipe": {
"title": "Pipe",
"teaser": "Abstract base class defining the API for pipeline components.",
"tag": "class",
"tag_new": 2,
"source": "spacy/pipeline.pyx"
},
"dependenyparser": {
"title": "DependencyParser",
"tag": "class",
@ -127,18 +133,22 @@
"entityrecognizer": {
"title": "EntityRecognizer",
"teaser": "Annotate named entities on documents.",
"tag": "class",
"source": "spacy/pipeline.pyx"
},
"textcategorizer": {
"title": "TextCategorizer",
"teaser": "Add text categorization models to spaCy pipelines.",
"tag": "class",
"tag_new": 2,
"source": "spacy/pipeline.pyx"
},
"dependencyparser": {
"title": "DependencyParser",
"teaser": "Annotate syntactic dependencies on documents.",
"tag": "class",
"source": "spacy/pipeline.pyx"
},
@ -149,15 +159,23 @@
"source": "spacy/tokenizer.pyx"
},
"lemmatizer": {
"title": "Lemmatizer",
"tag": "class"
},
"tagger": {
"title": "Tagger",
"teaser": "Annotate part-of-speech tags on documents.",
"tag": "class",
"source": "spacy/pipeline.pyx"
},
"tensorizer": {
"title": "Tensorizer",
"teaser": "Add a tensor with position-sensitive meaning representations to a document.",
"tag": "class",
"tag_new": 2,
"source": "spacy/pipeline.pyx"
},
@ -169,23 +187,38 @@
"goldcorpus": {
"title": "GoldCorpus",
"teaser": "An annotated corpus, using the JSON file format.",
"tag": "class",
"tag_new": 2,
"source": "spacy/gold.pyx"
},
"binder": {
"title": "Binder",
"tag": "class",
"tag_new": 2,
"source": "spacy/tokens/binder.pyx"
},
"vectors": {
"title": "Vectors",
"teaser": "Store, save and load word vectors.",
"tag": "class",
"tag_new": 2,
"source": "spacy/vectors.pyx"
},
"annotation": {
"title": "Annotation Specifications"
"title": "Annotation Specifications",
"teaser": "Schemes used for labels, tags and training data.",
"menu": {
"Tokenization": "tokenization",
"Sentence Boundaries": "sbd",
"POS Tagging": "pos-tagging",
"Lemmatization": "lemmatization",
"Dependencies": "dependency-parsing",
"Named Entities": "named-entities",
"Training Data": "training"
}
}
}

View File

@ -1,26 +1,17 @@
//- 💫 DOCS > USAGE > COMMAND LINE INTERFACE
include ../../_includes/_mixins
//- 💫 DOCS > API > TOP-LEVEL > COMMAND LINE INTERFACE
p
| As of v1.7.0, spaCy comes with new command line helpers to download and
| link models and show useful debugging information. For a list of available
| commands, type #[code spacy --help].
+infobox("⚠️ Deprecation note")
| As of spaCy 2.0, the #[code model] command to initialise a model data
| directory is deprecated. The command was only necessary because previous
| versions of spaCy expected a model directory to already be set up. This
| has since been changed, so you can use the #[+api("cli#train") #[code train]]
| command straight away.
+h(2, "download") Download
+h(3, "download") Download
p
| Download #[+a("/docs/usage/models") models] for spaCy. The downloader finds the
| Download #[+a("/usage/models") models] for spaCy. The downloader finds the
| best-matching compatible version, uses pip to download the model as a
| package and automatically creates a
| #[+a("/docs/usage/models#usage") shortcut link] to load the model by name.
| #[+a("/usage/models#usage") shortcut link] to load the model by name.
| Direct downloads don't perform any compatibility checks and require the
| model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]).
@ -49,15 +40,15 @@ p
| detailed messages in case things go wrong. It's #[strong not recommended]
| to use this command as part of an automated process. If you know which
| model your project needs, you should consider a
| #[+a("/docs/usage/models#download-pip") direct download via pip], or
| #[+a("/usage/models#download-pip") direct download via pip], or
| uploading the model to a local PyPI installation and fetching it straight
| from there. This will also allow you to add it as a versioned package
| dependency to your project.
+h(2, "link") Link
+h(3, "link") Link
p
| Create a #[+a("/docs/usage/models#usage") shortcut link] for a model,
| Create a #[+a("/usage/models#usage") shortcut link] for a model,
| either a Python package or a local directory. This will let you load
| models from any location using a custom name via
| #[+api("spacy#load") #[code spacy.load()]].
@ -95,7 +86,7 @@ p
+cell flag
+cell Show help message and available arguments.
+h(2, "info") Info
+h(3, "info") Info
p
| Print information about your spaCy installation, models and local setup,
@ -122,15 +113,15 @@ p
+cell flag
+cell Show help message and available arguments.
+h(2, "convert") Convert
+h(3, "convert") Convert
p
| Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format]
| Convert files into spaCy's #[+a("/api/annotation#json-input") JSON format]
| for use with the #[code train] command and other experiment management
| functions. The right converter is chosen based on the file extension of
| the input file. Currently only supports #[code .conllu].
+code(false, "bash", "$").
+code(false, "bash", "$", false, false, true).
spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
+table(["Argument", "Type", "Description"])
@ -159,14 +150,18 @@ p
+cell flag
+cell Show help message and available arguments.
+h(2, "train") Train
+h(3, "train") Train
p
| Train a model. Expects data in spaCy's
| #[+a("/docs/api/annotation#json-input") JSON format].
| #[+a("/api/annotation#json-input") JSON format]. On each epoch, a model
| will be saved out to the directory. Accuracy scores and model details
| will be added to a #[+a("/usage/training#models-generating") #[code meta.json]]
| to allow packaging the model using the
| #[+api("cli#package") #[code package]] command.
+code(false, "bash", "$").
spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
+code(false, "bash", "$", false, false, true).
spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser] [--no-entities] [--gold-preproc]
+table(["Argument", "Type", "Description"])
+row
@ -204,6 +199,27 @@ p
+cell option
+cell Use GPU.
+row
+cell #[code --vectors], #[code -v]
+cell option
+cell Model to load vectors from.
+row
+cell #[code --meta-path], #[code -m]
+cell option
+cell
| #[+tag-new(2)] Optional path to model
| #[+a("/usage/training#models-generating") #[code meta.json]].
| All relevant properties like #[code lang], #[code pipeline] and
| #[code spacy_version] will be overwritten.
+row
+cell #[code --version], #[code -V]
+cell option
+cell
| Model version. Will be written out to the model's
| #[code meta.json] after training.
+row
+cell #[code --no-tagger], #[code -T]
+cell flag
@ -219,12 +235,18 @@ p
+cell flag
+cell Don't train NER.
+row
+cell #[code --gold-preproc], #[code -G]
+cell flag
+cell Use gold preprocessing.
+row
+cell #[code --help], #[code -h]
+cell flag
+cell Show help message and available arguments.
+h(3, "train-hyperparams") Environment variables for hyperparameters
+h(4, "train-hyperparams") Environment variables for hyperparameters
+tag-new(2)
p
| spaCy lets you set hyperparameters for training via environment variables.
@ -236,98 +258,149 @@ p
+code(false, "bash").
parser_hidden_depth=2 parser_maxout_pieces=1 train-parser
+under-construction
+table(["Name", "Description", "Default"])
+row
+cell #[code dropout_from]
+cell
+cell Initial dropout rate.
+cell #[code 0.2]
+row
+cell #[code dropout_to]
+cell
+cell Final dropout rate.
+cell #[code 0.2]
+row
+cell #[code dropout_decay]
+cell
+cell Rate of dropout change.
+cell #[code 0.0]
+row
+cell #[code batch_from]
+cell
+cell Initial batch size.
+cell #[code 1]
+row
+cell #[code batch_to]
+cell
+cell Final batch size.
+cell #[code 64]
+row
+cell #[code batch_compound]
+cell
+cell Rate of batch size acceleration.
+cell #[code 1.001]
+row
+cell #[code token_vector_width]
+cell
+cell Width of embedding tables and convolutional layers.
+cell #[code 128]
+row
+cell #[code embed_size]
+cell
+cell Number of rows in embedding tables.
+cell #[code 7500]
+row
+cell #[code parser_maxout_pieces]
+cell
+cell Number of pieces in the parser's and NER's first maxout layer.
+cell #[code 2]
+row
+cell #[code parser_hidden_depth]
+cell
+cell Number of hidden layers in the parser and NER.
+cell #[code 1]
+row
+cell #[code hidden_width]
+cell
+cell Size of the parser's and NER's hidden layers.
+cell #[code 128]
+row
+cell #[code learn_rate]
+cell
+cell Learning rate.
+cell #[code 0.001]
+row
+cell #[code optimizer_B1]
+cell
+cell Momentum for the Adam solver.
+cell #[code 0.9]
+row
+cell #[code optimizer_B2]
+cell
+cell Adagrad-momentum for the Adam solver.
+cell #[code 0.999]
+row
+cell #[code optimizer_eps]
+cell
+cell Epsilon value for the Adam solver.
+cell #[code 1e-08]
+row
+cell #[code L2_penalty]
+cell
+cell L2 regularisation penalty.
+cell #[code 1e-06]
+row
+cell #[code grad_norm_clip]
+cell
+cell Gradient L2 norm constraint.
+cell #[code 1.0]
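p
| As an alternative to prefixing the shell command, the same variables can
| be set programmatically before invoking the CLI. A minimal sketch with
| placeholder paths; environment values must be strings.
+code("Setting hyperparameters from Python").
    import os
    import subprocess

    env = dict(os.environ)
    env.update(parser_hidden_depth='2', parser_maxout_pieces='1',
               learn_rate='0.0001')
    # Run the training command with the hyperparameters in its environment.
    subprocess.call(['spacy', 'train', 'en', '/output', 'train.json',
                     'dev.json'], env=env)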
+h(2, "package") Package
+h(3, "evaluate") Evaluate
+tag-new(2)
p
| Generate a #[+a("/docs/usage/saving-loading#generating") model Python package]
| Evaluate a model's accuracy and speed on JSON-formatted annotated data.
| Will print the results and optionally export
| #[+a("/usage/visualizers") displaCy visualizations] of a sample set of
| parses to #[code .html] files. Visualizations for the dependency parse
| and NER will be exported as separate files if the respective component
| is present in the model's pipeline.
+code(false, "bash", "$", false, false, true).
spacy evaluate [model] [data_path] [--displacy-path] [--displacy-limit] [--gpu-id] [--gold-preproc]
+table(["Argument", "Type", "Description"])
+row
+cell #[code model]
+cell positional
+cell
| Model to evaluate. Can be a package or shortcut link name, or a
| path to a model data directory.
+row
+cell #[code data_path]
+cell positional
+cell Location of JSON-formatted evaluation data.
+row
+cell #[code --displacy-path], #[code -dp]
+cell option
+cell
| Directory to output rendered parses as HTML. If not set, no
| visualizations will be generated.
+row
+cell #[code --displacy-limit], #[code -dl]
+cell option
+cell
| Number of parses to generate per file. Defaults to #[code 25].
| Keep in mind that a significantly higher number might cause the
| #[code .html] files to render slowly.
+row
+cell #[code --gpu-id], #[code -g]
+cell option
+cell GPU to use, if any. Defaults to #[code -1] for CPU.
+row
+cell #[code --gold-preproc], #[code -G]
+cell flag
+cell Use gold preprocessing.
+h(3, "package") Package
p
| Generate a #[+a("/usage/training#models-generating") model Python package]
| from an existing model data directory. All data files are copied over.
| If the path to a meta.json is supplied, or a meta.json is found in the
| input directory, this file is used. Otherwise, the data can be entered
@ -336,8 +409,8 @@ p
| sure you're always using the latest versions. This means you need to be
| connected to the internet to use this command.
+code(false, "bash", "$").
spacy package [input_dir] [output_dir] [--meta] [--force]
+code(false, "bash", "$", false, false, true).
spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]
+table(["Argument", "Type", "Description"])
+row
@ -353,14 +426,14 @@ p
+row
+cell #[code --meta-path], #[code -m]
+cell option
+cell Path to meta.json file (optional).
+cell #[+tag-new(2)] Path to meta.json file (optional).
+row
+cell #[code --create-meta], #[code -c]
+cell flag
+cell
| Create a meta.json file on the command line, even if one already
| exists in the directory.
| #[+tag-new(2)] Create a meta.json file on the command line, even
| if one already exists in the directory.
+row
+cell #[code --force], #[code -f]

View File

@ -0,0 +1,91 @@
//- 💫 DOCS > API > TOP-LEVEL > COMPATIBILITY
p
| All Python code is written in an
| #[strong intersection of Python 2 and Python 3]. This is easy in Cython,
| but somewhat ugly in Python. Logic that deals with Python or platform
| compatibility only lives in #[code spacy.compat]. To distinguish them from
| the built-in functions, replacement functions are suffixed with an
| underscore, e.g. #[code unicode_]. For specific checks, spaCy uses the
| #[code six] and #[code ftfy] packages.
+aside-code("Example").
from spacy.compat import unicode_, json_dumps
compatible_unicode = unicode_('hello world')
compatible_json = json_dumps({'key': 'value'})
+table(["Name", "Python 2", "Python 3"])
+row
+cell #[code compat.bytes_]
+cell #[code str]
+cell #[code bytes]
+row
+cell #[code compat.unicode_]
+cell #[code unicode]
+cell #[code str]
+row
+cell #[code compat.basestring_]
+cell #[code basestring]
+cell #[code str]
+row
+cell #[code compat.input_]
+cell #[code raw_input]
+cell #[code input]
+row
+cell #[code compat.json_dumps]
+cell #[code ujson.dumps] with #[code .decode('utf8')]
+cell #[code ujson.dumps]
+row
+cell #[code compat.path2str]
+cell #[code str(path)] with #[code .decode('utf8')]
+cell #[code str(path)]
+h(3, "is_config") compat.is_config
+tag function
p
| Check if a specific configuration of Python version and operating system
| matches the user's setup. Mostly used to display targeted error messages.
+aside-code("Example").
from spacy.compat import is_config
if is_config(python2=True, windows=True):
    print("You are using Python 2 on Windows.")
+table(["Name", "Type", "Description"])
+row
+cell #[code python2]
+cell bool
+cell spaCy is executed with Python 2.x.
+row
+cell #[code python3]
+cell bool
+cell spaCy is executed with Python 3.x.
+row
+cell #[code windows]
+cell bool
+cell spaCy is executed on Windows.
+row
+cell #[code linux]
+cell bool
+cell spaCy is executed on Linux.
+row
+cell #[code osx]
+cell bool
+cell spaCy is executed on OS X or macOS.
+row("foot")
+cell returns
+cell bool
+cell Whether the specified configuration matches the user's platform.

View File

@ -1,14 +1,12 @@
//- 💫 DOCS > API > DISPLACY
include ../../_includes/_mixins
//- 💫 DOCS > API > TOP-LEVEL > DISPLACY
p
| As of v2.0, spaCy comes with a built-in visualization suite. For more
| info and examples, see the usage guide on
| #[+a("/docs/usage/visualizers") visualizing spaCy].
| #[+a("/usage/visualizers") visualizing spaCy].
+h(2, "serve") displacy.serve
+h(3, "displacy.serve") displacy.serve
+tag method
+tag-new(2)
@ -60,7 +58,7 @@ p
+cell bool
+cell
| Don't parse #[code Doc] and instead, expect a dict or list of
| dicts. #[+a("/docs/usage/visualizers#manual-usage") See here]
| dicts. #[+a("/usage/visualizers#manual-usage") See here]
| for formats and examples.
+cell #[code False]
@ -70,7 +68,7 @@ p
+cell Port to serve visualization.
+cell #[code 5000]
+h(2, "render") displacy.render
+h(3, "displacy.render") displacy.render
+tag method
+tag-new(2)
@ -127,24 +125,24 @@ p Render a dependency parse tree or named entity visualization.
+cell bool
+cell
| Don't parse #[code Doc] and instead, expect a dict or list of
| dicts. #[+a("/docs/usage/visualizers#manual-usage") See here]
| dicts. #[+a("/usage/visualizers#manual-usage") See here]
| for formats and examples.
+cell #[code False]
+footrow
+row("foot")
+cell returns
+cell unicode
+cell Rendered HTML markup.
+cell
+h(2, "options") Visualizer options
+h(3, "displacy_options") Visualizer options
p
| The #[code options] argument lets you specify additional settings for
| each visualizer. If a setting is not present in the options, the default
| value will be used.
+h(3, "options-dep") Dependency Visualizer options
+h(4, "options-dep") Dependency Visualizer options
+aside-code("Example").
options = {'compact': True, 'color': 'blue'}
@ -219,7 +217,7 @@ p
+cell Distance between words in px.
+cell #[code 175] / #[code 85] (compact)
+h(3, "options-ent") Named Entity Visualizer options
+h(4, "displacy_options-ent") Named Entity Visualizer options
+aside-code("Example").
options = {'ents': ['PERSON', 'ORG', 'PRODUCT'],
@ -244,6 +242,6 @@ p
p
| By default, displaCy comes with colours for all
| #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy].
| #[+a("/api/annotation#named-entities") entity types supported by spaCy].
| If you're using custom entity types, you can use the #[code colors]
| setting to add your own colours for them.

View File

@ -1,15 +1,13 @@
//- 💫 DOCS > API > SPACY
//- 💫 DOCS > API > TOP-LEVEL > SPACY
include ../../_includes/_mixins
+h(2, "load") spacy.load
+h(3, "spacy.load") spacy.load
+tag function
+tag-model
p
| Load a model via its #[+a("/docs/usage/models#usage") shortcut link],
| Load a model via its #[+a("/usage/models#usage") shortcut link],
| the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
| #[+a("/usage/training#models-generating") model package], a unicode
| path or a #[code Path]-like object. spaCy will try resolving the load
| argument in this order. If a model is loaded from a shortcut link or
| package name, spaCy will assume it's a Python package and import it and
@ -38,25 +36,57 @@ p
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
| #[+a("/usage/processing-pipelines#disabling") disable].
+footrow
+row("foot")
+cell returns
+cell #[code Language]
+cell A #[code Language] object with the loaded model.
+infobox("⚠️ Deprecation note")
+infobox("Deprecation note", "⚠️")
.o-block
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
| will also raise an error if no model could be loaded and never just
| return an empty #[code Language] object. If you need a blank language,
| you need to import it explicitly (#[code from spacy.lang.en import English])
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
| you can use the new function #[+api("spacy#blank") #[code spacy.blank()]]
| or import the class explicitly, e.g.
| #[code from spacy.lang.en import English].
+code-new nlp = spacy.load('/model')
+code-old nlp = spacy.load('en', path='/model')
+h(2, "info") spacy.info
+h(3, "spacy.blank") spacy.blank
+tag function
+tag-new(2)
p
| Create a blank model of a given language class. This function is the
| twin of #[code spacy.load()].
+aside-code("Example").
nlp_en = spacy.blank('en')
nlp_de = spacy.blank('de')
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell ISO code of the language class to load.
+row
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/usage/processing-pipelines#disabling") disable].
+row("foot")
+cell returns
+cell #[code Language]
+cell An empty #[code Language] object of the appropriate subclass.
+h(3, "spacy.info") spacy.info
+tag function
p
@ -83,13 +113,13 @@ p
+cell Print information as Markdown.
+h(2, "explain") spacy.explain
+h(3, "spacy.explain") spacy.explain
+tag function
p
| Get a description for a given POS tag, dependency label or entity type.
| For a list of available terms, see
| #[+src(gh("spacy", "spacy/glossary.py")) glossary.py].
| #[+src(gh("spacy", "spacy/glossary.py")) #[code glossary.py]].
+aside-code("Example").
spacy.explain('NORP')
@ -107,18 +137,18 @@ p
+cell unicode
+cell Term to explain.
+footrow
+row("foot")
+cell returns
+cell unicode
+cell The explanation, or #[code None] if not found in the glossary.
+h(2, "set_factory") spacy.set_factory
+h(3, "spacy.set_factory") spacy.set_factory
+tag function
+tag-new(2)
p
| Set a factory that returns a custom
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline]
| #[+a("/usage/processing-pipelines") processing pipeline]
| component. Factories are useful for creating stateful components, especially ones which depend on shared data.
+aside-code("Example").

View File

@ -1,10 +1,8 @@
//- 💫 DOCS > API > UTIL
include ../../_includes/_mixins
//- 💫 DOCS > API > TOP-LEVEL > UTIL
p
| spaCy comes with a small collection of utility functions located in
| #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py].
| #[+src(gh("spaCy", "spacy/util.py")) #[code spacy/util.py]].
| Because utility functions are mostly intended for
| #[strong internal use within spaCy], their behaviour may change with
| future releases. The functions documented on this page should be safe
@ -12,7 +10,7 @@ p
| recommend having additional tests in place if your application depends on
| any of spaCy's utilities.
+h(2, "get_data_path") util.get_data_path
+h(3, "util.get_data_path") util.get_data_path
+tag function
p
@ -25,12 +23,12 @@ p
+cell bool
+cell Only return path if it exists, otherwise return #[code None].
+footrow
+row("foot")
+cell returns
+cell #[code Path] / #[code None]
+cell Data path or #[code None].
+h(2, "set_data_path") util.set_data_path
+h(3, "util.set_data_path") util.set_data_path
+tag function
p
@ -47,12 +45,12 @@ p
+cell unicode or #[code Path]
+cell Path to new data directory.
+h(2, "get_lang_class") util.get_lang_class
+h(3, "util.get_lang_class") util.get_lang_class
+tag function
p
| Import and load a #[code Language] class. Allows lazy-loading
| #[+a("/docs/usage/adding-languages") language data] and importing
| #[+a("/usage/adding-languages") language data] and importing
| languages using the two-letter language code.
+aside-code("Example").
@ -67,12 +65,12 @@ p
+cell unicode
+cell Two-letter language code, e.g. #[code 'en'].
+footrow
+row("foot")
+cell returns
+cell #[code Language]
+cell Language class.
+h(2, "load_model") util.load_model
+h(3, "util.load_model") util.load_model
+tag function
+tag-new(2)
@ -101,12 +99,12 @@ p
+cell -
+cell Specific overrides, like pipeline components to disable.
+footrow
+row("foot")
+cell returns
+cell #[code Language]
+cell #[code Language] class with the loaded model.
+h(2, "load_model_from_path") util.load_model_from_path
+h(3, "util.load_model_from_path") util.load_model_from_path
+tag function
+tag-new(2)
@ -139,18 +137,18 @@ p
+cell -
+cell Specific overrides, like pipeline components to disable.
+footrow
+row("foot")
+cell returns
+cell #[code Language]
+cell #[code Language] class with the loaded model.
+h(2, "load_model_from_init_py") util.load_model_from_init_py
+h(3, "util.load_model_from_init_py") util.load_model_from_init_py
+tag function
+tag-new(2)
p
| A helper function to use in the #[code load()] method of a model package's
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py].
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]].
+aside-code("Example").
from spacy.util import load_model_from_init_py
@ -169,12 +167,12 @@ p
+cell -
+cell Specific overrides, like pipeline components to disable.
+footrow
+row("foot")
+cell returns
+cell #[code Language]
+cell #[code Language] class with the loaded model.
+h(2, "get_model_meta") util.get_model_meta
+h(3, "util.get_model_meta") util.get_model_meta
+tag function
+tag-new(2)
@ -190,17 +188,17 @@ p
+cell unicode or #[code Path]
+cell Path to model directory.
+footrow
+row("foot")
+cell returns
+cell dict
+cell The model's meta data.
+h(2, "is_package") util.is_package
+h(3, "util.is_package") util.is_package
+tag function
p
| Check if string maps to a package installed via pip. Mainly used to
| validate #[+a("/docs/usage/models") model packages].
| validate #[+a("/usage/models") model packages].
+aside-code("Example").
util.is_package('en_core_web_sm') # True
@ -212,18 +210,18 @@ p
+cell unicode
+cell Name of package.
+footrow
+row("foot")
+cell returns
+cell #[code bool]
+cell #[code True] if installed package, #[code False] if not.
+h(2, "get_package_path") util.get_package_path
+h(3, "util.get_package_path") util.get_package_path
+tag function
+tag-new(2)
p
| Get path to an installed package. Mainly used to resolve the location of
| #[+a("/docs/usage/models") model packages]. Currently imports the package
| #[+a("/usage/models") model packages]. Currently imports the package
| to find its path.
+aside-code("Example").
@ -236,12 +234,12 @@ p
+cell unicode
+cell Name of installed package.
+footrow
+row("foot")
+cell returns
+cell #[code Path]
+cell Path to model package directory.
+h(2, "is_in_jupyter") util.is_in_jupyter
+h(3, "util.is_in_jupyter") util.is_in_jupyter
+tag function
+tag-new(2)
@ -257,17 +255,17 @@ p
return display(HTML(html))
+table(["Name", "Type", "Description"])
+footrow
+row("foot")
+cell returns
+cell bool
+cell #[code True] if in Jupyter, #[code False] if not.
+h(2, "update_exc") util.update_exc
+h(3, "util.update_exc") util.update_exc
+tag function
p
| Update, validate and overwrite
| #[+a("/docs/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions].
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions].
| Used to combine global exceptions with custom, language-specific
| exceptions. Will raise an error if key doesn't match #[code ORTH] values.
@ -288,20 +286,20 @@ p
+cell dicts
+cell Exception dictionaries to add to the base exceptions, in order.
+footrow
+row("foot")
+cell returns
+cell dict
+cell Combined tokenizer exceptions.
+h(2, "prints") util.prints
+h(3, "util.prints") util.prints
+tag function
+tag-new(2)
p
| Print a formatted, text-wrapped message with optional title. If a text
| argument is a #[code Path], it's converted to a string. Should only
| be used for interactive components like the #[+api("cli") cli].
| be used for interactive components like the command-line interface.
+aside-code("Example").
data_path = Path('/some/path')

131 website/api/annotation.jade Normal file
View File

@ -0,0 +1,131 @@
//- 💫 DOCS > API > ANNOTATION SPECS
include ../_includes/_mixins
p This document describes the target annotations spaCy is trained to predict.
+section("tokenization")
+h(2, "tokenization") Tokenization
p
| Tokenization standards are based on the
| #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] corpus.
| The tokenizer differs from most by including tokens for significant
| whitespace. Any sequence of whitespace characters beyond a single space
| (#[code ' ']) is included as a token.
+aside-code("Example").
from spacy.lang.en import English
nlp = English()
tokens = nlp('Some\nspaces  and\ttab characters')
tokens_text = [t.text for t in tokens]
assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and',
                       '\t', 'tab', 'characters']
p
| The whitespace tokens are useful for much the same reason punctuation is:
| it's often an important delimiter in the text. By preserving it in the
| token output, we are able to maintain a simple alignment between the
| tokens and the original string, and we ensure that no information is
| lost during processing.
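p
| For example, because the extra whitespace is preserved as tokens, the
| original string can always be reconstructed from the token texts. A small
| sketch using only the English tokenizer:
+code("Reconstructing the original text").
    from spacy.lang.en import English

    nlp = English()
    text = 'Some\nspaces  and\ttab characters'
    doc = nlp(text)
    # Each token's text plus its trailing whitespace concatenates back to
    # the exact original string, so no information is lost.
    assert ''.join(token.text_with_ws for token in doc) == text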
+section("sbd")
+h(2, "sentence-boundary") Sentence boundary detection
p
| Sentence boundaries are calculated from the syntactic parse tree, so
| features such as punctuation and capitalisation play an important but
| non-decisive role in determining the sentence boundaries. Usually this
| means that the sentence boundaries will at least coincide with clause
| boundaries, even given poorly punctuated text.
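p
| A short sketch of reading the resulting boundaries, assuming an English
| model with a parser (such as #[code en_core_web_sm]) is installed. The
| exact splits depend on the model.
+code("Reading sentence boundaries").
    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'This is a sentence. This is another sentence.')
    # Sentence spans are derived from the dependency parse.
    for sent in doc.sents:
        print(sent.text)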
+section("pos-tagging")
+h(2, "pos-tagging") Part-of-speech Tagging
+aside("Tip: Understanding tags")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of a tag. For example,
| #[code spacy.explain("RB")] will return "adverb".
include _annotation/_pos-tags
+section("lemmatization")
+h(2, "lemmatization") Lemmatization
p A "lemma" is the uninflected form of a word. In English, this means:
+list
+item #[strong Adjectives]: The form like "happy", not "happier" or "happiest"
+item #[strong Adverbs]: The form like "badly", not "worse" or "worst"
+item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children"
+item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written"
p
| The lemmatization data is taken from
| #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a
| special case for pronouns: all pronouns are lemmatized to the special
| token #[code -PRON-].
+infobox("About spaCy's custom pronoun lemma")
| Unlike verbs and common nouns, there's no clear base form of a personal
| pronoun. Should the lemma of "me" be "I", or should we normalize person
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
| novel symbol, #[code -PRON-], which is used as the lemma for
| all personal pronouns.
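p
| A quick sketch of what this looks like in practice, assuming an English
| model with a tagger (such as #[code en_core_web_sm]) is installed:
+code("Lemmatization example").
    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'I was reading her books')
    # Content words map to their uninflected forms, e.g. "was" to "be" and
    # "books" to "book", while the pronouns "I" and "her" both receive the
    # special lemma '-PRON-'.
    print([(token.text, token.lemma_) for token in doc])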
+section("dependency-parsing")
+h(2, "dependency-parsing") Syntactic Dependency Parsing
+aside("Tip: Understanding labels")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of a label. For example,
| #[code spacy.explain("prt")] will return "particle".
include _annotation/_dep-labels
+section("named-entities")
+h(2, "named-entities") Named Entity Recognition
+aside("Tip: Understanding entity types")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of an entity label. For example,
| #[code spacy.explain("LANGUAGE")] will return "any named language".
include _annotation/_named-entities
+h(3, "biluo") BILUO Scheme
include _annotation/_biluo
+section("training")
+h(2, "json-input") JSON input format for training
+under-construction
p spaCy takes training data in the following format:
+code("Example structure").
doc: {
    id: string,
    paragraphs: [{
        raw: string,
        sents: [int],
        tokens: [{
            start: int,
            tag: string,
            head: int,
            dep: string
        }],
        ner: [{
            start: int,
            end: int,
            label: string
        }],
        brackets: [{
            start: int,
            end: int,
            label: string
        }]
    }]
}
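p
| As a minimal illustration of that structure, here is one made-up document
| built as a Python dict and serialised to JSON. All values are invented,
| and the #[code head] values are assumed to be offsets relative to each
| token; real training corpora are of course much larger.
+code("Example document").
    import json

    doc = {
        'id': 'example-0',
        'paragraphs': [{
            'raw': 'London is big.',
            'sents': [0],
            'tokens': [
                {'start': 0,  'tag': 'NNP', 'head': 1,  'dep': 'nsubj'},
                {'start': 7,  'tag': 'VBZ', 'head': 0,  'dep': 'ROOT'},
                {'start': 10, 'tag': 'JJ',  'head': -1, 'dep': 'acomp'},
                {'start': 13, 'tag': '.',   'head': -2, 'dep': 'punct'}
            ],
            'ner': [{'start': 0, 'end': 6, 'label': 'GPE'}],
            'brackets': []
        }]
    }
    print(json.dumps(doc, indent=4))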

View File

@ -1,6 +1,6 @@
//- 💫 DOCS > API > BINDER
include ../../_includes/_mixins
include ../_includes/_mixins
p A container class for serializing collections of #[code Doc] objects.

View File

@ -0,0 +1,5 @@
//- 💫 DOCS > API > DEPENDENCYPARSER
include ../_includes/_mixins
!=partial("pipe", { subclass: "DependencyParser", short: "parser", pipeline_id: "parser" })

Some files were not shown because too many files have changed in this diff.