Wrap try/except around model saving

This commit is contained in:
Matthew Honnibal 2017-10-05 08:14:24 -05:00
commit c6cd81f192
331 changed files with 10443 additions and 10377 deletions

View File

@ -1 +1,55 @@
environment:
matrix:
# For Python versions available on Appveyor, see
# http://www.appveyor.com/docs/installed-software#python
# The list here is complete (excluding Python 2.6, which
# isn't covered by this document) at the time of writing.
- PYTHON: "C:\\Python27"
#- PYTHON: "C:\\Python33"
#- PYTHON: "C:\\Python34"
#- PYTHON: "C:\\Python35"
#- PYTHON: "C:\\Python27-x64"
#- PYTHON: "C:\\Python33-x64"
#- DISTUTILS_USE_SDK: "1"
#- PYTHON: "C:\\Python34-x64"
#- DISTUTILS_USE_SDK: "1"
#- PYTHON: "C:\\Python35-x64"
- PYTHON: "C:\\Python36-x64"
install:
# We need wheel installed to build wheels
- "%PYTHON%\\python.exe -m pip install wheel"
- "%PYTHON%\\python.exe -m pip install cython"
- "%PYTHON%\\python.exe -m pip install -r requirements.txt"
- "%PYTHON%\\python.exe -m pip install -e ."
build: off build: off
test_script:
# Put your test command here.
# If you don't need to build C extensions on 64-bit Python 3.3 or 3.4,
# you can remove "build.cmd" from the front of the command, as it's
# only needed to support those cases.
# Note that you must use the environment variable %PYTHON% to refer to
# the interpreter you're using - Appveyor does not do anything special
# to put the Python version you want to use on PATH.
- "%PYTHON%\\python.exe -m pytest spacy/"
after_test:
# This step builds your wheels.
# Again, you only need build.cmd if you're building C extensions for
# 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct
# interpreter
- "%PYTHON%\\python.exe setup.py bdist_wheel"
artifacts:
# bdist_wheel puts your built wheel in the dist directory
- path: dist\*
#on_success:
# You can use this step to upload your artifacts to a public website.
# See Appveyor's documentation for more details. Or you can simply
# access your wheels from the Appveyor "artifacts" tab for your build.

11
.buildkite/sdist.yml Normal file
View File

@ -0,0 +1,11 @@
steps:
-
command: "fab env clean make test sdist"
label: ":dizzy: :python:"
artifact_paths: "dist/*.tar.gz"
- wait
- trigger: "spacy-sdist-against-models"
label: ":dizzy: :hammer:"
build:
env:
SPACY_VERSION: "{$SPACY_VERSION}"

4
.gitignore vendored
View File

@ -1,14 +1,12 @@
# spaCy # spaCy
spacy/data/ spacy/data/
corpora/ corpora/
models/ /models/
keys/ keys/
# Website # Website
website/www/ website/www/
website/_deploy.sh website/_deploy.sh
website/package.json
website/announcement.jade
website/.gitignore website/.gitignore
# Cython / C extensions # Cython / C extensions

View File

@ -1,322 +0,0 @@
'''WIP --- Doesn't work well yet'''
import plac
import random
import six
import cProfile
import pstats
import pathlib
import cPickle as pickle
from itertools import izip
import spacy
import cytoolz
import cupy as xp
import cupy.cuda
import chainer.cuda
import chainer.links as L
import chainer.functions as F
from chainer import Chain, Variable, report
import chainer.training
import chainer.optimizers
from chainer.training import extensions
from chainer.iterators import SerialIterator
from chainer.datasets import TupleDataset
class SentimentAnalyser(object):
    '''Pipeline component that writes a sentiment score onto spaCy docs.

    Wraps a model object exposing `predict(X)`, where `X` is the feature
    matrix produced by `get_features`.
    '''

    def __init__(self, model, max_length=100):
        '''Store the wrapped model and the per-text feature length.'''
        self.max_length = max_length
        self._model = model

    @classmethod
    def load(cls, path, nlp, max_length=100):
        '''Restore an analyser from `path`.

        Not implemented yet: always raises NotImplementedError. The
        commented-out outline below is kept as a sketch of the approach.
        '''
        #with (path / 'config.json').open() as file_:
        #    model = model_from_json(file_.read())
        #with (path / 'model').open('rb') as file_:
        #    lstm_weights = pickle.load(file_)
        #embeddings = get_embeddings(nlp.vocab)
        #model.set_weights([embeddings] + lstm_weights)
        #return cls(model, max_length=max_length)
        raise NotImplementedError

    def __call__(self, doc):
        '''Score a single doc in place.'''
        features = get_features([doc], self.max_length)
        scores = self._model.predict(features)
        self.set_sentiment(doc, scores)

    def pipe(self, docs, batch_size=1000, n_threads=2):
        '''Score a stream of docs sentence by sentence, yielding each doc.

        Every sentence prediction is shifted by 0.5 and added to the
        sentiment of the doc that owns the sentence. `n_threads` is accepted
        but not used here.
        '''
        for chunk in cytoolz.partition_all(batch_size, docs):
            batch = list(chunk)
            sents = [sent for doc in batch for sent in doc.sents]
            features = get_features(sents, self.max_length)
            scores = self._model.predict(features)
            for sent, score in zip(sents, scores):
                sent.doc.sentiment += score - 0.5
            for doc in batch:
                yield doc

    def set_sentiment(self, doc, y):
        '''Write the first element of `y` to `doc.sentiment` as a float.'''
        # Sentiment has a native slot for a single float.
        # For arbitrary data storage, there's:
        # doc.user_data['my_data'] = y
        doc.sentiment = float(y[0])
class Classifier(Chain):
    '''Training wrapper that turns a predictor's output into a loss.'''

    def __init__(self, predictor):
        '''Register `predictor` as the child link of this chain.'''
        Chain.__init__(self, predictor=predictor)

    def __call__(self, x, t):
        '''Run the predictor on `x`, report loss and accuracy against the
        targets `t`, and return the loss.'''
        scores = self.predictor(x)
        metrics = {
            'loss': F.softmax_cross_entropy(scores, t),
            'accuracy': F.accuracy(scores, t),
        }
        report(metrics, self)
        return metrics['loss']
class SentimentModel(Chain):
    '''Embed -> encode -> attend -> predict chain, moved to GPU 0.'''

    def __init__(self, nlp, shape, **settings):
        '''Build the four sub-chains from the sizes in `shape`.

        `shape` must provide 'nr_vector', 'nr_dim', 'nr_hidden' and
        'nr_class'. `settings` is accepted but not used here.
        '''
        def init_vectors(arr):
            # Fill the embedding table from the spaCy vocab.
            return set_vectors(arr, nlp.vocab)

        embed = _Embed(shape['nr_vector'], shape['nr_dim'],
                       shape['nr_hidden'], set_vectors=init_vectors)
        nr_hidden = shape['nr_hidden']
        encode = _Encode(nr_hidden, nr_hidden)
        attend = _Attend(nr_hidden, nr_hidden)
        predict = _Predict(nr_hidden, shape['nr_class'])
        Chain.__init__(self, embed=embed, encode=encode, attend=attend,
                       predict=predict)
        self.to_gpu(0)

    def __call__(self, sentence):
        '''Pass `sentence` through the four stages in order.'''
        embedded = self.embed(sentence)
        encoded = self.encode(embedded)
        attended = self.attend(encoded)
        return self.predict(attended)
class _Embed(Chain):
    '''Embedding lookup followed by a bias-free linear projection.'''

    def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None):
        '''Create the lookup table (initialised via `set_vectors`) and the
        projection to `nr_out` dimensions.'''
        lookup = L.EmbedID(nr_vector, nr_dim, initialW=set_vectors)
        projection = L.Linear(None, nr_out, nobias=True)
        Chain.__init__(self, embed=lookup, project=projection)
        self.embed.W.volatile = False

    def __call__(self, sentence):
        '''Return one projected embedding per slice of the transposed
        input.'''
        projected = []
        for ts in F.transpose(sentence):
            projected.append(self.project(self.embed(ts)))
        return projected
class _Encode(Chain):
    '''Bidirectional LSTM encoder whose two directions are mixed bilinearly.'''

    def __init__(self, nr_in, nr_out):
        '''Create the forward LSTM, the backward LSTM and the mixing layer.'''
        forward = L.LSTM(nr_in, nr_out)
        backward = L.LSTM(nr_in, nr_out)
        mixer = L.Bilinear(nr_out, nr_out, nr_out)
        Chain.__init__(self, fwd=forward, bwd=backward, mix=mixer)

    def __call__(self, sentence):
        '''Encode each position from both directions and combine them.'''
        self.fwd.reset_state()
        forward = [self.fwd(step) for step in sentence]
        self.bwd.reset_state()
        backward = [self.bwd(step) for step in reversed(sentence)]
        # Put the backward states back into left-to-right order.
        backward.reverse()
        return [F.elu(self.mix(f, b)) for f, b in zip(forward, backward)]
class _Attend(Chain):
    '''Placeholder attention layer: sums the encoded positions.'''

    def __init__(self, nr_in, nr_out):
        '''No parameters are created; `nr_in` and `nr_out` are unused.'''
        super(_Attend, self).__init__()

    def __call__(self, sentence):
        '''Return the sum of the items in `sentence`.'''
        return sum(sentence)
class _Predict(Chain):
    '''Two-layer output network with an ELU non-linearity in between.'''

    def __init__(self, nr_in, nr_out):
        '''Create the hidden layer (nr_in -> nr_in) and the output layer
        (nr_in -> nr_out).'''
        hidden = L.Linear(nr_in, nr_in)
        output = L.Linear(nr_in, nr_out)
        Chain.__init__(self, l1=hidden, l2=output)

    def __call__(self, vector):
        '''Map `vector` to the output scores.'''
        return self.l2(F.elu(self.l1(vector)))
class SentenceDataset(TupleDataset):
    '''Dataset of (sentence features, label) pairs, where every sentence
    takes the label of the document it came from.'''

    def __init__(self, nlp, texts, labels, max_length):
        '''Parse `texts` with `nlp`, split the docs into sentences and
        featurize each sentence up to `max_length` tokens.'''
        self.max_length = max_length
        docs = nlp.pipe(texts, batch_size=5000, n_threads=3)
        sents, sent_labels = self._get_labelled_sentences(docs, labels)
        TupleDataset.__init__(self, get_features(sents, max_length),
                              sent_labels)

    def __getitem__(self, index):
        '''Return one example tuple, or a list of tuples for a slice.'''
        batches = [dataset[index] for dataset in self._datasets]
        if not isinstance(index, slice):
            return tuple(batches)
        length = len(batches[0])
        return [tuple([batch[i] for batch in batches])
                for i in six.moves.range(length)]

    def _get_labelled_sentences(self, docs, doc_labels):
        '''Flatten docs into sentences, repeating each doc label once per
        sentence. Returns the sentence list and an int array of labels.'''
        sentences = []
        labels = []
        for doc, y in izip(docs, doc_labels):
            doc_sents = list(doc.sents)
            sentences.extend(doc_sents)
            labels.extend([y] * len(doc_sents))
        return sentences, xp.asarray(labels, dtype='i')
class DocDataset(TupleDataset):
    '''Dataset of (document features, label) pairs, one per input text.'''

    def __init__(self, nlp, texts, labels, max_length=100):
        '''Parse `texts` with `nlp` and featurize each doc.

        nlp: object whose `pipe` method yields the parsed docs.
        texts: the raw texts.
        labels: one label per text.
        max_length: number of feature slots per doc. The original code read
            an undefined `max_length` name here; it is now a parameter whose
            default matches the script's default.
        '''
        self.max_length = max_length
        docs = nlp.pipe(texts, batch_size=5000, n_threads=3)
        # Initialise through TupleDataset, the declared base class. The
        # original called DatasetMixin.__init__, which is not imported in
        # this file.
        TupleDataset.__init__(self, get_features(docs, self.max_length),
                              labels)
def read_data(data_dir, limit=0):
    '''Read labelled texts from the `pos` and `neg` sub-directories.

    data_dir: path object supporting `/` and `iterdir()`. Files under `pos`
        get label 1, files under `neg` get label 0.
    limit: if >= 1, keep only that many examples after shuffling.

    Returns a two-item list `[texts, labels]` of equal-length tuples. The
    examples are shuffled with the `random` module. When there are no
    examples, two empty tuples are returned, so that
    `texts, labels = read_data(...)` still unpacks.
    '''
    examples = []
    for subdir, label in (('pos', 1), ('neg', 0)):
        for filename in (data_dir / subdir).iterdir():
            with filename.open() as file_:
                examples.append((file_.read(), label))
    random.shuffle(examples)
    if limit >= 1:
        examples = examples[:limit]
    if not examples:
        # zip(*[]) produces nothing, which cannot be unpacked into two names.
        return [(), ()]
    return list(zip(*examples))  # Unzips into two tuples
def get_features(docs, max_length):
    '''Build an integer feature matrix with one row per doc.

    Each row holds the `norm` values of the doc's first `max_length` tokens
    that have a vector and are neither punctuation nor whitespace. Unused
    slots stay 0.
    '''
    docs = list(docs)
    features = xp.zeros((len(docs), max_length), dtype='i')
    for row, doc in enumerate(docs):
        col = 0
        for token in doc:
            if not token.has_vector or token.is_punct or token.is_space:
                continue
            features[row, col] = token.norm
            col += 1
            if col >= max_length:
                break
    return features
def set_vectors(vectors, vocab):
    '''Copy lexeme vectors into `vectors` and record each row in `lex.norm`.

    A lexeme that has a vector and whose `rank + 1` fits inside the table is
    stored at row `rank + 1`, and its `norm` is set to that row. Every other
    lexeme gets `norm = 0`. Returns the `vectors` array that was passed in.
    '''
    for lex in vocab:
        if not (lex.has_vector and (lex.rank + 1) < vectors.shape[0]):
            lex.norm = 0
            continue
        row = lex.rank + 1
        lex.norm = row
        vectors[row] = lex.vector
    return vectors
def train(train_texts, train_labels, dev_texts, dev_labels,
          lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
          by_sentence=True):
    '''Train the sentiment classifier on GPU 0 and return the trained model.

    train_texts, train_labels: training texts and their labels.
    dev_texts, dev_labels: held-out texts and labels used for evaluation.
    lstm_shape: dict of model sizes. 'nr_vector' and 'nr_dim' are filled in
        from the spaCy vocab when missing, so this dict is modified in place.
    lstm_settings: extra keyword settings forwarded to SentimentModel.
    lstm_optimizer: accepted but not used here; Adam is always used.
    batch_size: minibatch size for both iterators.
    nb_epoch: number of training epochs.
    by_sentence: train on sentences (SentenceDataset) rather than on whole
        documents (DocDataset).

    Returns the Classifier wrapping the SentimentModel.
    '''
    nlp = spacy.load('en', entity=False)
    if 'nr_vector' not in lstm_shape:
        lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector)
    if 'nr_dim' not in lstm_shape:
        lstm_shape['nr_dim'] = nlp.vocab.vectors_length
    print("Make model")
    model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings))
    print("Parsing texts...")
    if by_sentence:
        train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length'])
        dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length'])
    else:
        train_data = DocDataset(nlp, train_texts, train_labels)
        dev_data = DocDataset(nlp, dev_texts, dev_labels)
    train_iter = SerialIterator(train_data, batch_size=batch_size,
                                shuffle=True, repeat=True)
    dev_iter = SerialIterator(dev_data, batch_size=batch_size,
                              shuffle=False, repeat=False)
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0)
    # Stop after the requested number of epochs. The stop trigger used to be
    # hard-coded to (1, 'epoch'), which ignored nb_epoch.
    trainer = chainer.training.Trainer(updater, (nb_epoch, 'epoch'), out='result')
    trainer.extend(extensions.Evaluator(dev_iter, model, device=0))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport([
        'epoch', 'main/accuracy', 'validation/main/accuracy']))
    trainer.extend(extensions.ProgressBar())
    trainer.run()
    # The caller assigns the result, so hand back the trained model.
    return model
def evaluate(model_dir, texts, labels, max_length=100):
    '''Return the fraction of `texts` whose predicted polarity matches
    `labels`.

    A doc counts as positive when `doc.sentiment >= 0.5`. The pipeline is
    replaced by the tagger, the parser and a SentimentAnalyser loaded from
    `model_dir`.
    '''
    nlp = spacy.load('en')
    nlp.pipeline = [
        nlp.tagger,
        nlp.parser,
        SentimentAnalyser.load(model_dir, nlp, max_length=max_length),
    ]
    nr_correct = 0
    nr_seen = 0
    for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
        predicted = bool(doc.sentiment >= 0.5)
        nr_correct += predicted == bool(labels[nr_seen])
        nr_seen += 1
    return float(nr_correct) / nr_seen
@plac.annotations(
    train_dir=("Location of training file or directory"),
    dev_dir=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    is_runtime=("Demonstrate run-time usage", "flag", "r", bool),
    nr_hidden=("Number of hidden units", "option", "H", int),
    max_length=("Maximum sentence length", "option", "L", int),
    dropout=("Dropout", "option", "d", float),
    learn_rate=("Learn rate", "option", "e", float),
    nb_epoch=("Number of training epochs", "option", "i", int),
    batch_size=("Size of minibatches for training LSTM", "option", "b", int),
    nr_examples=("Limit to N examples", "option", "n", int)
)
def main(model_dir, train_dir, dev_dir,
         is_runtime=False,
         nr_hidden=64, max_length=100, # Shape
         dropout=0.5, learn_rate=0.001, # General NN config
         nb_epoch=5, batch_size=32, nr_examples=-1): # Training params
    '''Train the sentiment model, or evaluate a saved one with --is-runtime.'''
    model_dir = pathlib.Path(model_dir)
    train_dir = pathlib.Path(train_dir)
    dev_dir = pathlib.Path(dev_dir)
    if is_runtime:
        dev_texts, dev_labels = read_data(dev_dir)
        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
        print(acc)
    else:
        print("Read data")
        train_texts, train_labels = read_data(train_dir, limit=nr_examples)
        dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
        print("Using GPU 0")
        #chainer.cuda.get_device(0).use()
        train_labels = xp.asarray(train_labels, dtype='i')
        dev_labels = xp.asarray(dev_labels, dtype='i')
        # Forward the --dropout option; the settings used to hard-code 0.5,
        # so the command-line value was silently ignored.
        train(train_texts, train_labels, dev_texts, dev_labels,
              {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2,
               'nr_vector': 5000},
              {'dropout': dropout, 'lr': learn_rate},
              {},
              nb_epoch=nb_epoch, batch_size=batch_size)
if __name__ == '__main__':
    # Script entry point: hand `main` to plac, which runs it with the
    # command-line arguments.
    # Profiling variant, kept for reference:
    #cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
    #s = pstats.Stats("Profile.prof")
    #s.strip_dirs().sort_stats("time").print_stats()
    plac.call(main)

View File

@ -20,71 +20,71 @@ The algorithm is O(n) at run-time for document of length n because we're only ev
matching over the tag patterns. So no matter how many phrases we're looking for, matching over the tag patterns. So no matter how many phrases we're looking for,
our pattern set stays very small (exact size depends on the maximum length we're our pattern set stays very small (exact size depends on the maximum length we're
looking for, as the query language currently has no quantifiers) looking for, as the query language currently has no quantifiers)
The example expects a .bz2 file from the Reddit corpus, and a patterns file,
formatted in jsonl as a sequence of entries like this:
{"text":"Anchorage"}
{"text":"Angola"}
{"text":"Ann Arbor"}
{"text":"Annapolis"}
{"text":"Appalachia"}
{"text":"Argentina"}
""" """
from __future__ import print_function, unicode_literals, division from __future__ import print_function, unicode_literals, division
from ast import literal_eval
from bz2 import BZ2File from bz2 import BZ2File
import time import time
import math import math
import codecs import codecs
import plac import plac
import ujson
from preshed.maps import PreshMap
from preshed.counter import PreshCounter
from spacy.strings import hash_string
from spacy.en import English
from spacy.matcher import PhraseMatcher from spacy.matcher import PhraseMatcher
import spacy
def read_gazetteer(tokenizer, loc, n=-1): def read_gazetteer(tokenizer, loc, n=-1):
for i, line in enumerate(open(loc)): for i, line in enumerate(open(loc)):
phrase = literal_eval('u' + line.strip()) data = ujson.loads(line.strip())
if ' (' in phrase and phrase.endswith(')'): phrase = tokenizer(data['text'])
phrase = phrase.split(' (', 1)[0] for w in phrase:
if i >= n: _ = tokenizer.vocab[w.text]
break
phrase = tokenizer(phrase)
if all((t.is_lower and t.prob >= -10) for t in phrase):
continue
if len(phrase) >= 2: if len(phrase) >= 2:
yield phrase yield phrase
def read_text(bz2_loc): def read_text(bz2_loc, n=10000):
with BZ2File(bz2_loc) as file_: with BZ2File(bz2_loc) as file_:
for line in file_: for i, line in enumerate(file_):
yield line.decode('utf8') data = ujson.loads(line)
yield data['body']
if i >= n:
break
def get_matches(tokenizer, phrases, texts, max_length=6): def get_matches(tokenizer, phrases, texts, max_length=6):
matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length) matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
print("Match") matcher.add('Phrase', None, *phrases)
for text in texts: for text in texts:
doc = tokenizer(text) doc = tokenizer(text)
for w in doc:
_ = doc.vocab[w.text]
matches = matcher(doc) matches = matcher(doc)
for mwe in doc.ents: for ent_id, start, end in matches:
yield mwe yield (ent_id, doc[start:end].text)
def main(patterns_loc, text_loc, counts_loc, n=10000000): def main(patterns_loc, text_loc, n=10000):
nlp = English(parser=False, tagger=False, entity=False) nlp = spacy.blank('en')
print("Make matcher") nlp.vocab.lex_attr_getters = {}
phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n) phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
counts = PreshCounter() count = 0
t1 = time.time() t1 = time.time()
for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)): for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
counts.inc(hash_string(mwe.text), 1) count += 1
t2 = time.time() t2 = time.time()
print("10m tokens in %d s" % (t2 - t1)) print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
with codecs.open(counts_loc, 'w', 'utf8') as file_:
for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
text = phrase.string
key = hash_string(text)
count = counts[key]
if count != 0:
file_.write('%d\t%s\n' % (count, text))
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -13,24 +13,29 @@ Input data:
https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip
Developed for: spaCy 1.7.1 Developed for: spaCy 1.7.1
Last tested for: spaCy 1.7.1 Last tested for: spaCy 2.0.0a13
''' '''
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
import plac import plac
from pathlib import Path from pathlib import Path
import random import random
import json import json
import tqdm
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
import spacy.orth as orth_funcs
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.pipeline import BeamEntityRecognizer from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
from spacy.pipeline import EntityRecognizer
from spacy.tokenizer import Tokenizer from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.attrs import * from spacy.attrs import *
from spacy.gold import GoldParse from spacy.gold import GoldParse
from spacy.gold import _iob_to_biluo as iob_to_biluo from spacy.gold import iob_to_biluo
from spacy.gold import minibatch
from spacy.scorer import Scorer from spacy.scorer import Scorer
import spacy.util
try: try:
unicode unicode
@ -38,96 +43,38 @@ except NameError:
unicode = str unicode = str
spacy.util.set_env_log(True)
def init_vocab(): def init_vocab():
return Vocab( return Vocab(
lex_attr_getters={ lex_attr_getters={
LOWER: lambda string: string.lower(), LOWER: lambda string: string.lower(),
SHAPE: orth_funcs.word_shape, NORM: lambda string: string.lower(),
PREFIX: lambda string: string[0], PREFIX: lambda string: string[0],
SUFFIX: lambda string: string[-3:], SUFFIX: lambda string: string[-3:],
CLUSTER: lambda string: 0,
IS_ALPHA: orth_funcs.is_alpha,
IS_ASCII: orth_funcs.is_ascii,
IS_DIGIT: lambda string: string.isdigit(),
IS_LOWER: orth_funcs.is_lower,
IS_PUNCT: orth_funcs.is_punct,
IS_SPACE: lambda string: string.isspace(),
IS_TITLE: orth_funcs.is_title,
IS_UPPER: orth_funcs.is_upper,
IS_STOP: lambda string: False,
IS_OOV: lambda string: True
}) })
def save_vocab(vocab, path):
path = Path(path)
if not path.exists():
path.mkdir()
elif not path.is_dir():
raise IOError("Can't save vocab to %s\nNot a directory" % path)
with (path / 'strings.json').open('w') as file_:
vocab.strings.dump(file_)
vocab.dump((path / 'lexemes.bin').as_posix())
def load_vocab(path):
path = Path(path)
if not path.exists():
raise IOError("Cannot load vocab from %s\nDoes not exist" % path)
if not path.is_dir():
raise IOError("Cannot load vocab from %s\nNot a directory" % path)
return Vocab.load(path)
def init_ner_model(vocab, features=None):
if features is None:
features = tuple(EntityRecognizer.feature_templates)
return EntityRecognizer(vocab, features=features)
def save_ner_model(model, path):
path = Path(path)
if not path.exists():
path.mkdir()
if not path.is_dir():
raise IOError("Can't save model to %s\nNot a directory" % path)
model.model.dump((path / 'model').as_posix())
with (path / 'config.json').open('w') as file_:
data = json.dumps(model.cfg)
if not isinstance(data, unicode):
data = data.decode('utf8')
file_.write(data)
def load_ner_model(vocab, path):
return EntityRecognizer.load(path, vocab)
class Pipeline(object): class Pipeline(object):
@classmethod
def load(cls, path):
path = Path(path)
if not path.exists():
raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
if not path.is_dir():
raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
vocab = load_vocab(path)
tokenizer = Tokenizer(vocab, {}, None, None, None)
ner_model = load_ner_model(vocab, path / 'ner')
return cls(vocab, tokenizer, ner_model)
def __init__(self, vocab=None, tokenizer=None, entity=None): def __init__(self, vocab=None, tokenizer=None, entity=None):
if vocab is None: if vocab is None:
vocab = init_vocab() vocab = init_vocab()
if tokenizer is None: if tokenizer is None:
tokenizer = Tokenizer(vocab, {}, None, None, None) tokenizer = Tokenizer(vocab, {}, None, None, None)
if entity is None: if entity is None:
entity = init_ner_model(self.vocab) entity = NeuralEntityRecognizer(vocab)
self.vocab = vocab self.vocab = vocab
self.tokenizer = tokenizer self.tokenizer = tokenizer
self.entity = entity self.entity = entity
self.pipeline = [self.entity] self.pipeline = [self.entity]
def begin_training(self):
for model in self.pipeline:
model.begin_training([])
optimizer = Adam(NumpyOps(), 0.001)
return optimizer
def __call__(self, input_): def __call__(self, input_):
doc = self.make_doc(input_) doc = self.make_doc(input_)
for process in self.pipeline: for process in self.pipeline:
@ -147,14 +94,16 @@ class Pipeline(object):
gold = GoldParse(doc, entities=annotations) gold = GoldParse(doc, entities=annotations)
return gold return gold
def update(self, input_, annot): def update(self, inputs, annots, sgd, losses=None, drop=0.):
doc = self.make_doc(input_) if losses is None:
gold = self.make_gold(input_, annot) losses = {}
for ner in gold.ner: docs = [self.make_doc(input_) for input_ in inputs]
if ner not in (None, '-', 'O'): golds = [self.make_gold(input_, annot) for input_, annot in
action, label = ner.split('-', 1) zip(inputs, annots)]
self.entity.add_label(label)
return self.entity.update(doc, gold) self.entity.update(docs, golds, drop=drop,
sgd=sgd, losses=losses)
return losses
def evaluate(self, examples): def evaluate(self, examples):
scorer = Scorer() scorer = Scorer()
@ -164,34 +113,36 @@ class Pipeline(object):
scorer.score(doc, gold) scorer.score(doc, gold)
return scorer.scores return scorer.scores
def average_weights(self): def to_disk(self, path):
self.entity.model.end_training()
def save(self, path):
path = Path(path) path = Path(path)
if not path.exists(): if not path.exists():
path.mkdir() path.mkdir()
elif not path.is_dir(): elif not path.is_dir():
raise IOError("Can't save pipeline to %s\nNot a directory" % path) raise IOError("Can't save pipeline to %s\nNot a directory" % path)
save_vocab(self.vocab, path / 'vocab') self.vocab.to_disk(path / 'vocab')
save_ner_model(self.entity, path / 'ner') self.entity.to_disk(path / 'ner')
def from_disk(self, path):
path = Path(path)
if not path.exists():
raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
if not path.is_dir():
raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
self.vocab = self.vocab.from_disk(path / 'vocab')
self.entity = self.entity.from_disk(path / 'ner')
def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5): def train(nlp, train_examples, dev_examples, nr_epoch=5):
next_epoch = train_examples sgd = nlp.begin_training()
print("Iter", "Loss", "P", "R", "F") print("Iter", "Loss", "P", "R", "F")
for i in range(nr_epoch): for i in range(nr_epoch):
this_epoch = next_epoch random.shuffle(train_examples)
next_epoch = [] losses = {}
loss = 0 for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8):
for input_, annot in this_epoch: inputs, annots = zip(*batch)
loss += nlp.update(input_, annot) nlp.update(list(inputs), list(annots), sgd, losses=losses)
if (i+1) < nr_epoch:
next_epoch.append((input_, annot))
random.shuffle(next_epoch)
scores = nlp.evaluate(dev_examples) scores = nlp.evaluate(dev_examples)
report_scores(i, loss, scores) report_scores(i, losses['ner'], scores)
nlp.average_weights()
scores = nlp.evaluate(dev_examples) scores = nlp.evaluate(dev_examples)
report_scores(channels, i+1, loss, scores) report_scores(channels, i+1, loss, scores)
@ -208,7 +159,8 @@ def read_examples(path):
with path.open() as file_: with path.open() as file_:
sents = file_.read().strip().split('\n\n') sents = file_.read().strip().split('\n\n')
for sent in sents: for sent in sents:
if not sent.strip(): sent = sent.strip()
if not sent:
continue continue
tokens = sent.split('\n') tokens = sent.split('\n')
while tokens and tokens[0].startswith('#'): while tokens and tokens[0].startswith('#'):
@ -217,28 +169,39 @@ def read_examples(path):
iob = [] iob = []
for token in tokens: for token in tokens:
if token.strip(): if token.strip():
pieces = token.split() pieces = token.split('\t')
words.append(pieces[1]) words.append(pieces[1])
iob.append(pieces[2]) iob.append(pieces[2])
yield words, iob_to_biluo(iob) yield words, iob_to_biluo(iob)
def get_labels(examples):
labels = set()
for words, tags in examples:
for tag in tags:
if '-' in tag:
labels.add(tag.split('-')[1])
return sorted(labels)
@plac.annotations( @plac.annotations(
model_dir=("Path to save the model", "positional", None, Path), model_dir=("Path to save the model", "positional", None, Path),
train_loc=("Path to your training data", "positional", None, Path), train_loc=("Path to your training data", "positional", None, Path),
dev_loc=("Path to your development data", "positional", None, Path), dev_loc=("Path to your development data", "positional", None, Path),
) )
def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'), def main(model_dir, train_loc, dev_loc, nr_epoch=30):
train_loc=None, dev_loc=None, nr_epoch=30): print(model_dir, train_loc, dev_loc)
train_examples = list(read_examples(train_loc))
train_examples = read_examples(train_loc)
dev_examples = read_examples(dev_loc) dev_examples = read_examples(dev_loc)
nlp = Pipeline.load(model_dir) nlp = Pipeline()
for label in get_labels(train_examples):
nlp.entity.add_label(label)
print("Add label", label)
train(nlp, train_examples, list(dev_examples), ctx, nr_epoch) train(nlp, train_examples, list(dev_examples), nr_epoch)
nlp.save(model_dir) nlp.to_disk(model_dir)
if __name__ == '__main__': if __name__ == '__main__':
main() plac.call(main)

View File

@ -25,7 +25,7 @@ For more details, see the documentation:
* Saving and loading models: https://spacy.io/docs/usage/saving-loading * Saving and loading models: https://spacy.io/docs/usage/saving-loading
Developed for: spaCy 1.7.6 Developed for: spaCy 1.7.6
Last tested for: spaCy 1.7.6 Last updated for: spaCy 2.0.0a13
""" """
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
@ -34,55 +34,41 @@ from pathlib import Path
import random import random
import spacy import spacy
from spacy.gold import GoldParse from spacy.gold import GoldParse, minibatch
from spacy.tagger import Tagger from spacy.pipeline import NeuralEntityRecognizer
from spacy.pipeline import TokenVectorEncoder
def get_gold_parses(tokenizer, train_data):
'''Shuffle and create GoldParse objects'''
random.shuffle(train_data)
for raw_text, entity_offsets in train_data:
doc = tokenizer(raw_text)
gold = GoldParse(doc, entities=entity_offsets)
yield doc, gold
def train_ner(nlp, train_data, output_dir): def train_ner(nlp, train_data, output_dir):
# Add new words to vocab
for raw_text, _ in train_data:
doc = nlp.make_doc(raw_text)
for word in doc:
_ = nlp.vocab[word.orth]
random.seed(0) random.seed(0)
# You may need to change the learning rate. It's generally difficult to optimizer = nlp.begin_training(lambda: [])
# guess what rate you should set, especially when you have limited data. nlp.meta['name'] = 'en_ent_animal'
nlp.entity.model.learn_rate = 0.001 for itn in range(50):
for itn in range(1000): losses = {}
random.shuffle(train_data) for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3):
loss = 0. docs, golds = zip(*batch)
for raw_text, entity_offsets in train_data: nlp.update(docs, golds, losses=losses, sgd=optimizer, update_shared=True,
gold = GoldParse(doc, entities=entity_offsets) drop=0.35)
# By default, the GoldParse class assumes that the entities print(losses)
# described by offset are complete, and all other words should if not output_dir:
# have the tag 'O'. You can tell it to make no assumptions return
# about the tag of a word by giving it the tag '-'. elif not output_dir.exists():
# However, this allows a trivial solution to the current output_dir.mkdir()
# learning problem: if words are either 'any tag' or 'ANIMAL', nlp.to_disk(output_dir)
# the model can learn that all words can be tagged 'ANIMAL'.
#for i in range(len(gold.ner)):
#if not gold.ner[i].endswith('ANIMAL'):
# gold.ner[i] = '-'
doc = nlp.make_doc(raw_text)
nlp.tagger(doc)
# As of 1.9, spaCy's parser now lets you supply a dropout probability
# This might help the model generalize better from only a few
# examples.
loss += nlp.entity.update(doc, gold, drop=0.9)
if loss == 0:
break
# This step averages the model's weights. This may or may not be good for
# your situation --- it's empirical.
nlp.end_training()
if output_dir:
if not output_dir.exists():
output_dir.mkdir()
nlp.save_to_directory(output_dir)
def main(model_name, output_directory=None): def main(model_name, output_directory=None):
print("Loading initial model", model_name) print("Creating initial model", model_name)
nlp = spacy.load(model_name) nlp = spacy.blank(model_name)
if output_directory is not None: if output_directory is not None:
output_directory = Path(output_directory) output_directory = Path(output_directory)
@ -91,6 +77,11 @@ def main(model_name, output_directory=None):
"Horses are too tall and they pretend to care about your feelings", "Horses are too tall and they pretend to care about your feelings",
[(0, 6, 'ANIMAL')], [(0, 6, 'ANIMAL')],
), ),
(
"Do they bite?",
[],
),
( (
"horses are too tall and they pretend to care about your feelings", "horses are too tall and they pretend to care about your feelings",
[(0, 6, 'ANIMAL')] [(0, 6, 'ANIMAL')]
@ -109,18 +100,20 @@ def main(model_name, output_directory=None):
) )
] ]
nlp.entity.add_label('ANIMAL') nlp.pipeline.append(TokenVectorEncoder(nlp.vocab))
nlp.pipeline.append(NeuralEntityRecognizer(nlp.vocab))
nlp.pipeline[-1].add_label('ANIMAL')
train_ner(nlp, train_data, output_directory) train_ner(nlp, train_data, output_directory)
# Test that the entity is recognized # Test that the entity is recognized
doc = nlp('Do you like horses?') text = 'Do you like horses?'
print("Ents in 'Do you like horses?':") print("Ents in 'Do you like horses?':")
doc = nlp(text)
for ent in doc.ents: for ent in doc.ents:
print(ent.label_, ent.text) print(ent.label_, ent.text)
if output_directory: if output_directory:
print("Loading from", output_directory) print("Loading from", output_directory)
nlp2 = spacy.load('en', path=output_directory) nlp2 = spacy.load(output_directory)
nlp2.entity.add_label('ANIMAL')
doc2 = nlp2('Do you like horses?') doc2 = nlp2('Do you like horses?')
for ent in doc2.ents: for ent in doc2.ents:
print(ent.label_, ent.text) print(ent.label_, ent.text)

View File

@ -1,3 +1,7 @@
'''Train a multi-label convolutional neural network text classifier,
using the spacy.pipeline.TextCategorizer component. The model is then added
to spacy.pipeline, and predictions are available at `doc.cats`.
'''
from __future__ import unicode_literals from __future__ import unicode_literals
import plac import plac
import random import random
@ -12,6 +16,11 @@ from spacy.gold import GoldParse, minibatch
from spacy.util import compounding from spacy.util import compounding
from spacy.pipeline import TextCategorizer from spacy.pipeline import TextCategorizer
# TODO: Remove this once we're not supporting models trained with thinc <6.9.0
import thinc.neural._classes.layernorm
thinc.neural._classes.layernorm.set_compat_six_eight(False)
def train_textcat(tokenizer, textcat, def train_textcat(tokenizer, textcat,
train_texts, train_cats, dev_texts, dev_cats, train_texts, train_cats, dev_texts, dev_cats,
@ -24,14 +33,15 @@ def train_textcat(tokenizer, textcat,
train_docs = [tokenizer(text) for text in train_texts] train_docs = [tokenizer(text) for text in train_texts]
train_gold = [GoldParse(doc, cats=cats) for doc, cats in train_gold = [GoldParse(doc, cats=cats) for doc, cats in
zip(train_docs, train_cats)] zip(train_docs, train_cats)]
train_data = zip(train_docs, train_gold) train_data = list(zip(train_docs, train_gold))
batch_sizes = compounding(4., 128., 1.001) batch_sizes = compounding(4., 128., 1.001)
for i in range(n_iter): for i in range(n_iter):
losses = {} losses = {}
train_data = tqdm.tqdm(train_data, leave=False) # Progress bar # Progress bar and minibatching
for batch in minibatch(train_data, size=batch_sizes): batches = minibatch(tqdm.tqdm(train_data, leave=False), size=batch_sizes)
for batch in batches:
docs, golds = zip(*batch) docs, golds = zip(*batch)
textcat.update((docs, None), golds, sgd=optimizer, drop=0.2, textcat.update(docs, golds, sgd=optimizer, drop=0.2,
losses=losses) losses=losses)
with textcat.model.use_params(optimizer.averages): with textcat.model.use_params(optimizer.averages):
scores = evaluate(tokenizer, textcat, dev_texts, dev_cats) scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
@ -61,12 +71,13 @@ def evaluate(tokenizer, textcat, texts, cats):
return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore} return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
def load_data(): def load_data(limit=0):
# Partition off part of the train data --- avoid running experiments # Partition off part of the train data --- avoid running experiments
# against test. # against test.
train_data, _ = thinc.extra.datasets.imdb() train_data, _ = thinc.extra.datasets.imdb()
random.shuffle(train_data) random.shuffle(train_data)
train_data = train_data[-limit:]
texts, labels = zip(*train_data) texts, labels = zip(*train_data)
cats = [(['POSITIVE'] if y else []) for y in labels] cats = [(['POSITIVE'] if y else []) for y in labels]
@ -86,7 +97,7 @@ def main(model_loc=None):
textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE']) textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
print("Load IMDB data") print("Load IMDB data")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data() (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=1000)
print("Itn.\tLoss\tP\tR\tF") print("Itn.\tLoss\tP\tR\tF")
progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}' progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'

View File

@ -0,0 +1,30 @@
'''Load vectors for a language trained using FastText
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
'''
from __future__ import unicode_literals
import plac
import numpy
import spacy.language
def main(vectors_loc):
    """Load word vectors from a FastText text-format file into a blank pipeline.

    vectors_loc: Path to the vectors file. The first line is read as a
        "<rows> <dimensions>" header; each following line is a word followed
        by its space-separated vector values.

    After loading, the similarity of two sample words is printed as a quick
    smoke test.
    """
    nlp = spacy.language.Language()
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()
        nr_dim = int(nr_dim)
        nlp.vocab.clear_vectors(nr_dim)
        for line in file_:
            # Drop the line ending (and any trailing separator) first, so the
            # right-split below sees exactly nr_dim value columns.
            line = line.rstrip().decode('utf8')
            if not line:
                # Tolerate blank lines, e.g. a trailing newline at EOF.
                continue
            # Split from the right: the last nr_dim fields are the vector,
            # everything before them is the word -- even if the word itself
            # contains whitespace characters.
            pieces = line.rsplit(' ', nr_dim)
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)
    doc = nlp(u'class colspan')
    print(doc[0].similarity(doc[1]))
if __name__ == '__main__':
plac.call(main)

5
fabfile.py vendored
View File

@ -14,6 +14,7 @@ VENV_DIR = path.join(PWD, ENV)
def env(lang='python2.7'): def env(lang='python2.7'):
if path.exists(VENV_DIR): if path.exists(VENV_DIR):
local('rm -rf {env}'.format(env=VENV_DIR)) local('rm -rf {env}'.format(env=VENV_DIR))
local('pip install virtualenv')
local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR)) local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
@ -32,6 +33,10 @@ def make():
local('pip install -r requirements.txt') local('pip install -r requirements.txt')
local('python setup.py build_ext --inplace') local('python setup.py build_ext --inplace')
def sdist():
    """Run `python setup.py sdist` from this file's directory, inside the
    project virtualenv."""
    project_root = path.dirname(__file__)
    with virtualenv(VENV_DIR), lcd(project_root):
        local('python setup.py sdist')
def clean(): def clean():
with lcd(path.dirname(__file__)): with lcd(path.dirname(__file__)):

View File

@ -1,9 +1,9 @@
cython<0.24 cython>=0.24,<0.27.0
pathlib pathlib
numpy>=1.7 numpy>=1.7
cymem>=1.30,<1.32 cymem>=1.30,<1.32
preshed>=1.0.0,<2.0.0 preshed>=1.0.0,<2.0.0
thinc>=6.8.0,<6.9.0 thinc>=6.9.0,<6.10.0
murmurhash>=0.28,<0.29 murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6 plac<1.0.0,>=0.9.6
six six
@ -13,7 +13,7 @@ requests>=2.13.0,<3.0.0
regex==2017.4.5 regex==2017.4.5
ftfy>=4.4.2,<5.0.0 ftfy>=4.4.2,<5.0.0
pytest>=3.0.6,<4.0.0 pytest>=3.0.6,<4.0.0
pip>=9.0.0,<10.0.0
mock>=2.0.0,<3.0.0 mock>=2.0.0,<3.0.0
msgpack-python msgpack-python
msgpack-numpy msgpack-numpy
html5lib==1.0b8

View File

@ -195,9 +195,8 @@ def setup_package():
'murmurhash>=0.28,<0.29', 'murmurhash>=0.28,<0.29',
'cymem>=1.30,<1.32', 'cymem>=1.30,<1.32',
'preshed>=1.0.0,<2.0.0', 'preshed>=1.0.0,<2.0.0',
'thinc>=6.8.0,<6.9.0', 'thinc>=6.9.0,<6.10.0',
'plac<1.0.0,>=0.9.6', 'plac<1.0.0,>=0.9.6',
'pip>=9.0.0,<10.0.0',
'six', 'six',
'pathlib', 'pathlib',
'ujson>=1.35', 'ujson>=1.35',

View File

@ -4,11 +4,13 @@ from __future__ import unicode_literals
from .cli.info import info as cli_info from .cli.info import info as cli_info
from .glossary import explain from .glossary import explain
from .deprecated import resolve_load_name from .deprecated import resolve_load_name
#from .about import __version__
from .about import __version__ from .about import __version__
from . import util from . import util
def load(name, **overrides): def load(name, **overrides):
from .deprecated import resolve_load_name
name = resolve_load_name(name, **overrides) name = resolve_load_name(name, **overrides)
return util.load_model(name, **overrides) return util.load_model(name, **overrides)

View File

@ -7,7 +7,7 @@ if __name__ == '__main__':
import plac import plac
import sys import sys
from spacy.cli import download, link, info, package, train, convert, model from spacy.cli import download, link, info, package, train, convert, model
from spacy.cli import profile from spacy.cli import profile, evaluate
from spacy.util import prints from spacy.util import prints
commands = { commands = {
@ -15,6 +15,7 @@ if __name__ == '__main__':
'link': link, 'link': link,
'info': info, 'info': info,
'train': train, 'train': train,
'evaluate': evaluate,
'convert': convert, 'convert': convert,
'package': package, 'package': package,
'model': model, 'model': model,

View File

@ -1,28 +1,27 @@
import ujson import ujson
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
from thinc.i2v import HashEmbed, StaticVectors
from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
from thinc.misc import Residual
from thinc.misc import BatchNorm as BN
from thinc.misc import LayerNorm as LN
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine from thinc.api import FeatureExtracter, with_getitem
from thinc.neural._classes.hash_embed import HashEmbed from thinc.api import uniqued, wrap, flatten_add_lengths, noop
from thinc.linear.linear import LinearModel
from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module
import random import random
import cytoolz import cytoolz
from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.static_vectors import StaticVectors
from thinc.neural._classes.batchnorm import BatchNorm as BN
from thinc.neural._classes.layernorm import LayerNorm as LN
from thinc.neural._classes.resnet import Residual
from thinc.neural import ReLu
from thinc.neural._classes.selu import SELU
from thinc import describe from thinc import describe
from thinc.describe import Dimension, Synapses, Biases, Gradient from thinc.describe import Dimension, Synapses, Biases, Gradient
from thinc.neural._classes.affine import _set_dimensions_if_needed from thinc.neural._classes.affine import _set_dimensions_if_needed
from thinc.api import FeatureExtracter, with_getitem import thinc.extra.load_nlp
from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
from thinc.neural._classes.attention import ParametricAttention
from thinc.linear.linear import LinearModel
from thinc.api import uniqued, wrap, flatten_add_lengths
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
from .tokens.doc import Doc from .tokens.doc import Doc
@ -31,6 +30,11 @@ from . import util
import numpy import numpy
import io import io
# TODO: Unset this once we no longer need to support previously trained models.
import thinc.neural._classes.layernorm
thinc.neural._classes.layernorm.set_compat_six_eight(True)
VECTORS_KEY = 'spacy_pretrained_vectors'
@layerize @layerize
def _flatten_add_lengths(seqs, pad=0, drop=0.): def _flatten_add_lengths(seqs, pad=0, drop=0.):
@ -225,33 +229,80 @@ def drop_layer(layer, factor=2.):
model.predict = layer model.predict = layer
return model return model
def link_vectors_to_models(vocab):
    """Point every lexeme at its vector row and publish the vector table.

    Each entry in `vocab` gets its `rank` set to the row that
    `vocab.vectors.key2row` holds for its `orth` value, or 0 when the key is
    not in the table. The vector data is then converted with the current
    `Model.ops` and stored in `thinc.extra.load_nlp.VECTORS` under
    `(ops.device, VECTORS_KEY)`.
    """
    table = vocab.vectors
    row_of = table.key2row
    backend = Model.ops
    for lexeme in vocab:
        key = lexeme.orth
        # Keys without a vector fall back to rank 0.
        lexeme.rank = row_of[key] if key in row_of else 0
    # Set an entry here, so that vectors are accessed by StaticVectors
    # (not ideal, but it is how the layer finds the data).
    thinc.extra.load_nlp.VECTORS[(backend.device, VECTORS_KEY)] = backend.asarray(table.data)
def Tok2Vec(width, embed_size, preprocess=None): def Tok2Vec(width, embed_size, **kwargs):
pretrained_dims = kwargs.get('pretrained_dims', 0)
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
'*': reapply}):
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm') norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix') prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix') suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape') shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
if pretrained_dims is not None and pretrained_dims >= 1:
glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3)) embed = uniqued(
tok2vec = ( (glove | norm | prefix | suffix | shape)
with_flatten( >> LN(Maxout(width, width*5, pieces=3)), column=5)
asarray(Model.ops, dtype='uint64') else:
>> uniqued(embed, column=5) embed = uniqued(
>> Residual( (norm | prefix | suffix | shape)
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) >> LN(Maxout(width, width*4, pieces=3)), column=5)
) ** 4, pad=4
)
convolution = Residual(
ExtractWindow(nW=1)
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
) )
if preprocess not in (False, None):
tok2vec = preprocess >> tok2vec tok2vec = (
FeatureExtracter(cols)
>> with_flatten(
embed >> (convolution ** 4), pad=4)
)
# Work around thinc API limitations :(. TODO: Revise in Thinc 7 # Work around thinc API limitations :(. TODO: Revise in Thinc 7
tok2vec.nO = width tok2vec.nO = width
tok2vec.embed = embed tok2vec.embed = embed
return tok2vec return tok2vec
def reapply(layer, n_times):
    """Wrap `layer` so that a single forward pass applies it `n_times` in a
    row, feeding each output back in as the next input.

    The returned backward callback runs the collected per-step callbacks in
    reverse order, passing the gradient from one into the next, and returns
    the running in-place sum of the gradients produced along the way.
    """
    def reapply_fwd(X, drop=0.):
        callbacks = []
        for _ in range(n_times):
            Y, finish_update = layer.begin_update(X, drop=drop)
            callbacks.append(finish_update)
            X = Y

        def reapply_bwd(dY, sgd=None):
            dX = None
            for finish_update in reversed(callbacks):
                dY = finish_update(dY, sgd=sgd)
                if dX is not None:
                    # In-place accumulation, as in the original.
                    dX += dY
                else:
                    dX = dY
            return dX
        return Y, reapply_bwd
    return wrap(reapply_fwd, layer)
def asarray(ops, dtype): def asarray(ops, dtype):
def forward(X, drop=0.): def forward(X, drop=0.):
return ops.asarray(X, dtype=dtype), None return ops.asarray(X, dtype=dtype), None
@ -455,20 +506,25 @@ def getitem(i):
return X[i], None return X[i], None
return layerize(getitem_fwd) return layerize(getitem_fwd)
def build_tagger_model(nr_class, token_vector_width, **cfg): def build_tagger_model(nr_class, **cfg):
embed_size = util.env_opt('embed_size', 7500) embed_size = util.env_opt('embed_size', 7000)
if 'token_vector_width' in cfg:
token_vector_width = cfg['token_vector_width']
else:
token_vector_width = util.env_opt('token_vector_width', 128)
pretrained_dims = cfg.get('pretrained_dims', 0)
with Model.define_operators({'>>': chain, '+': add}): with Model.define_operators({'>>': chain, '+': add}):
# Input: (doc, tensor) tuples if 'tok2vec' in cfg:
private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats()) tok2vec = cfg['tok2vec']
else:
tok2vec = Tok2Vec(token_vector_width, embed_size,
pretrained_dims=pretrained_dims)
model = ( model = (
fine_tune(private_tok2vec) tok2vec
>> with_flatten( >> with_flatten(Softmax(nr_class, token_vector_width))
Maxout(token_vector_width, token_vector_width)
>> Softmax(nr_class, token_vector_width)
)
) )
model.nI = None model.nI = None
model.tok2vec = tok2vec
return model return model
@ -514,6 +570,7 @@ def foreach(layer, drop_factor=1.0):
def build_text_classifier(nr_class, width=64, **cfg): def build_text_classifier(nr_class, width=64, **cfg):
nr_vector = cfg.get('nr_vector', 5000) nr_vector = cfg.get('nr_vector', 5000)
pretrained_dims = cfg.get('pretrained_dims', 0)
with Model.define_operators({'>>': chain, '+': add, '|': concatenate, with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
'**': clone}): '**': clone}):
if cfg.get('low_data'): if cfg.get('low_data'):
@ -521,7 +578,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
SpacyVectors SpacyVectors
>> flatten_add_lengths >> flatten_add_lengths
>> with_getitem(0, >> with_getitem(0,
Affine(width, 300) Affine(width, pretrained_dims)
) )
>> ParametricAttention(width) >> ParametricAttention(width)
>> Pooling(sum_pool) >> Pooling(sum_pool)
@ -548,18 +605,24 @@ def build_text_classifier(nr_class, width=64, **cfg):
) )
) )
static_vectors = ( if pretrained_dims:
SpacyVectors static_vectors = (
>> with_flatten(Affine(width, 300)) SpacyVectors
) >> with_flatten(Affine(width, pretrained_dims))
)
cnn_model = (
# TODO Make concatenate support lists # TODO Make concatenate support lists
concatenate_lists(trained_vectors, static_vectors) vectors = concatenate_lists(trained_vectors, static_vectors)
vectors_width = width*2
else:
vectors = trained_vectors
vectors_width = width
static_vectors = None
cnn_model = (
vectors
>> with_flatten( >> with_flatten(
LN(Maxout(width, width*2)) LN(Maxout(width, vectors_width))
>> Residual( >> Residual(
(ExtractWindow(nW=1) >> zero_init(Maxout(width, width*3))) (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
) ** 2, pad=2 ) ** 2, pad=2
) )
>> flatten_add_lengths >> flatten_add_lengths
@ -579,7 +642,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0)) >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
>> logistic >> logistic
) )
model.nO = nr_class
model.lsuv = False model.lsuv = False
return model return model

View File

@ -3,14 +3,15 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy-nightly' __title__ = 'spacy-nightly'
__version__ = '2.0.0a13' __version__ = '2.0.0a16'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io' __uri__ = 'https://spacy.io'
__author__ = 'Explosion AI' __author__ = 'Explosion AI'
__email__ = 'contact@explosion.ai' __email__ = 'contact@explosion.ai'
__license__ = 'MIT' __license__ = 'MIT'
__release__ = True
__docs_models__ = 'https://spacy.io/docs/usage/models' __docs_models__ = 'https://alpha.spacy.io/usage/models'
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download' __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json' __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json' __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'

View File

@ -1,5 +1,5 @@
# Reserve 64 values for flag features # Reserve 64 values for flag features
cpdef enum attr_id_t: cdef enum attr_id_t:
NULL_ATTR NULL_ATTR
IS_ALPHA IS_ALPHA
IS_ASCII IS_ASCII

View File

@ -94,6 +94,7 @@ IDS = {
# ATTR IDs, in order of the symbol # ATTR IDs, in order of the symbol
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
locals().update(IDS)
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):

View File

@ -4,5 +4,6 @@ from .link import link
from .package import package from .package import package
from .profile import profile from .profile import profile
from .train import train from .train import train
from .evaluate import evaluate
from .convert import convert from .convert import convert
from .model import model from .model import model

View File

@ -14,7 +14,7 @@ from ..util import prints
CONVERTERS = { CONVERTERS = {
'.conllu': conllu2json, '.conllu': conllu2json,
'.conll': conllu2json, '.conll': conllu2json,
'.iob': iob2json '.iob': iob2json,
} }

View File

@ -1,5 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from cytoolz import partition_all, concat
from ...compat import json_dumps, path2str from ...compat import json_dumps, path2str
from ...util import prints from ...util import prints
@ -10,11 +11,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
""" """
Convert IOB files into JSON format for use with train cli. Convert IOB files into JSON format for use with train cli.
""" """
# TODO: This isn't complete yet -- need to map from IOB to
# BILUO
with input_path.open('r', encoding='utf8') as file_: with input_path.open('r', encoding='utf8') as file_:
docs = read_iob(file_) sentences = read_iob(file_)
docs = merge_sentences(sentences, n_sents)
output_filename = input_path.parts[-1].replace(".iob", ".json") output_filename = input_path.parts[-1].replace(".iob", ".json")
output_file = output_path / output_filename output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f: with output_file.open('w', encoding='utf-8') as f:
@ -23,9 +22,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
title="Generated output file %s" % path2str(output_file)) title="Generated output file %s" % path2str(output_file))
def read_iob(file_): def read_iob(raw_sents):
sentences = [] sentences = []
for line in file_: for line in raw_sents:
if not line.strip(): if not line.strip():
continue continue
tokens = [t.split('|') for t in line.split()] tokens = [t.split('|') for t in line.split()]
@ -43,3 +42,15 @@ def read_iob(file_):
paragraphs = [{'sentences': [sent]} for sent in sentences] paragraphs = [{'sentences': [sent]} for sent in sentences]
docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs] docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
return docs return docs
def merge_sentences(docs, n_sents):
    """Merge consecutive docs so that each result holds up to n_sents docs'
    worth of sentences.

    docs: Iterable of dicts shaped like {'paragraphs': [{'sentences': [...]}]}.
    n_sents: Number of input docs to fold into each output doc (>= 1).
    RETURNS: List of merged docs. The first doc of every group is reused and
        extended in place with the sentences of the docs that follow it.
    RAISES: ValueError if n_sents is smaller than 1.

    Every doc in a group contributes its sentences. The previous version
    popped the first doc and then iterated `group[1:]`, which silently dropped
    the second doc of each group.
    """
    if n_sents < 1:
        raise ValueError("n_sents must be at least 1, got %r" % (n_sents,))
    merged = []
    for i, doc in enumerate(docs):
        if i % n_sents == 0:
            # Start of a new group: this doc collects the following ones.
            merged.append(doc)
        else:
            merged[-1]['paragraphs'][0]['sentences'].extend(
                doc['paragraphs'][0]['sentences'])
    return merged

119
spacy/cli/evaluate.py Normal file
View File

@ -0,0 +1,119 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import plac
import json
from collections import defaultdict
import cytoolz
from pathlib import Path
import dill
import tqdm
from thinc.neural._classes.model import Model
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer
import random
import numpy.random
from ..tokens.doc import Doc
from ..scorer import Scorer
from ..gold import GoldParse, merge_sents
from ..gold import GoldCorpus, minibatch
from ..util import prints
from .. import util
from .. import about
from .. import displacy
from ..compat import json_dumps
random.seed(0)
numpy.random.seed(0)
@plac.annotations(
    model=("Model name or path", "positional", None, str),
    data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
    gpu_id=("Use GPU", "option", "g", int),
    displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
    displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)
)
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
             displacy_path=None, displacy_limit=25):
    """
    Evaluate a model. To render a sample of parses in a HTML file, set an output
    directory as the displacy_path argument.
    """
    util.use_gpu(gpu_id)
    util.set_env_log(False)

    # Normalise both locations and bail out early if either is missing.
    data_path = util.ensure_path(data_path)
    displacy_path = util.ensure_path(displacy_path)
    if not data_path.exists():
        prints(data_path, title="Evaluation data not found", exits=1)
    if displacy_path and not displacy_path.exists():
        prints(displacy_path, title="Visualization output directory not found", exits=1)

    # The same file is passed as both train and dev location; only the dev
    # docs are read here.
    corpus = GoldCorpus(data_path, data_path)
    nlp = util.load_model(model)
    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))

    # Time only the evaluation call itself.
    begin = timer()
    scorer = nlp.evaluate(dev_docs, verbose=False)
    end = timer()
    elapsed = end - begin
    nwords = sum(len(pair[0]) for pair in dev_docs)
    print_results(scorer, time=elapsed, words=nwords, wps=nwords / elapsed)

    if not displacy_path:
        return
    docs, _ = zip(*dev_docs)
    # Only render the views whose component is listed in the model's pipeline.
    pipeline_names = nlp.meta.get('pipeline', [])
    render_parses(docs, displacy_path, model_name=model, limit=displacy_limit,
                  deps='parser' in pipeline_names, ents='ner' in pipeline_names)
    prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit)
def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True):
    """Render the first `limit` docs with displaCy and save them as HTML pages.

    docs: Sequence of parsed docs; the first doc's `user_data['title']` is set
        to `model_name` before rendering.
    output_path: Directory path object; 'entities.html' and/or 'parses.html'
        are written inside it.
    model_name: Title stored on the first doc.
    limit: Maximum number of docs to render.
    deps: Whether to write the dependency view ('parses.html').
    ents: Whether to write the entity view ('entities.html').
    """
    docs[0].user_data['title'] = model_name
    sample = docs[:limit]
    if ents:
        # Render before opening the file, so a rendering error does not leave
        # a truncated, empty output file behind.
        html = displacy.render(sample, style='ent', page=True)
        # Explicit encoding: the markup contains the docs' text, which the
        # platform default encoding may not be able to represent.
        with (output_path / 'entities.html').open('w', encoding='utf8') as file_:
            file_.write(html)
    if deps:
        html = displacy.render(sample, style='dep', page=True,
                               options={'compact': True})
        with (output_path / 'parses.html').open('w', encoding='utf8') as file_:
            file_.write(html)
def print_progress(itn, losses, dev_scores, wps=0.0):
    """Print one tab-separated progress row for iteration `itn`.

    itn: Iteration number (formatted as an integer).
    losses: Dict of losses keyed by component name; 'parser', 'ner' and
        'tagger' are read, each defaulting to 0.0.
    dev_scores: Dict of evaluation scores; its entries override the defaults
        and the losses above when keys collide.
    wps: Words per second; always takes precedence over a 'wps' entry in
        dev_scores.
    """
    zeroed = ('dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
              'ents_p', 'ents_r', 'ents_f', 'wps')
    row = dict((name, 0.0) for name in zeroed)
    row['dep_loss'] = losses.get('parser', 0.0)
    row['ner_loss'] = losses.get('ner', 0.0)
    row['tag_loss'] = losses.get('tagger', 0.0)
    row.update(dev_scores)
    row['wps'] = wps
    columns = [
        '{:d}',
        '{dep_loss:.3f}',
        '{ner_loss:.3f}',
        '{uas:.3f}',
        '{ents_p:.3f}',
        '{ents_r:.3f}',
        '{ents_f:.3f}',
        '{tags_acc:.3f}',
        '{token_acc:.3f}',
        '{wps:.1f}',
    ]
    print('\t'.join(columns).format(itn, **row))
def print_results(scorer, time, words, wps):
    """Format the evaluation summary and hand it to `util.print_table`.

    scorer: Object exposing token_acc, tags_acc, uas, las, ents_p, ents_r and
        ents_f, each shown with two decimals.
    time: Elapsed time, shown with two decimals and an ' s' suffix.
    words: Word count, passed through unformatted.
    wps: Words per second, shown without decimals.
    """
    table = {}
    table['Time'] = '%.2f s' % time
    table['Words'] = words
    table['Words/s'] = '%.0f' % wps
    # (label, scorer attribute) pairs, in display order.
    accuracy_rows = (
        ('TOK', 'token_acc'),
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    for label, attr in accuracy_rows:
        table[label] = '%.2f' % getattr(scorer, attr)
    util.print_table(table, title="Results")

View File

@ -105,8 +105,11 @@ def generate_pipeline():
"parser, ner. For more information, see the docs on processing pipelines.", "parser, ner. For more information, see the docs on processing pipelines.",
title="Enter your model's pipeline components") title="Enter your model's pipeline components")
pipeline = util.get_raw_input("Pipeline components", True) pipeline = util.get_raw_input("Pipeline components", True)
replace = {'True': True, 'False': False} subs = {'True': True, 'False': False}
return replace[pipeline] if pipeline in replace else pipeline.split(', ') if pipeline in subs:
return subs[pipeline]
else:
return [p.strip() for p in pipeline.split(',')]
def validate_meta(meta, keys): def validate_meta(meta, keys):

View File

@ -8,8 +8,11 @@ import cytoolz
from pathlib import Path from pathlib import Path
import dill import dill
import tqdm import tqdm
from thinc.neural._classes.model import Model
from thinc.neural.optimizers import linear_decay from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer from timeit import default_timer as timer
import random
import numpy.random
from ..tokens.doc import Doc from ..tokens.doc import Doc
from ..scorer import Scorer from ..scorer import Scorer
@ -17,9 +20,13 @@ from ..gold import GoldParse, merge_sents
from ..gold import GoldCorpus, minibatch from ..gold import GoldCorpus, minibatch
from ..util import prints from ..util import prints
from .. import util from .. import util
from .. import about
from .. import displacy from .. import displacy
from ..compat import json_dumps from ..compat import json_dumps
random.seed(0)
numpy.random.seed(0)
@plac.annotations( @plac.annotations(
lang=("model language", "positional", None, str), lang=("model language", "positional", None, str),
@ -29,15 +36,17 @@ from ..compat import json_dumps
n_iter=("number of iterations", "option", "n", int), n_iter=("number of iterations", "option", "n", int),
n_sents=("number of sentences", "option", "ns", int), n_sents=("number of sentences", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int), use_gpu=("Use GPU", "option", "g", int),
resume=("Whether to resume training", "flag", "R", bool), vectors=("Model to load vectors from", "option", "v"),
no_tagger=("Don't train tagger", "flag", "T", bool), no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool), no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool), no_entities=("Don't train NER", "flag", "N", bool),
gold_preproc=("Use gold preprocessing", "flag", "G", bool), gold_preproc=("Use gold preprocessing", "flag", "G", bool),
version=("Model version", "option", "V", str),
meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
) )
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False, use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
gold_preproc=False): gold_preproc=False, version="0.0.0", meta_path=None):
""" """
Train a model. Expects data in spaCy's JSON format. Train a model. Expects data in spaCy's JSON format.
""" """
@ -46,19 +55,24 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
output_path = util.ensure_path(output_dir) output_path = util.ensure_path(output_dir)
train_path = util.ensure_path(train_data) train_path = util.ensure_path(train_data)
dev_path = util.ensure_path(dev_data) dev_path = util.ensure_path(dev_data)
meta_path = util.ensure_path(meta_path)
if not output_path.exists(): if not output_path.exists():
output_path.mkdir() output_path.mkdir()
if not train_path.exists(): if not train_path.exists():
prints(train_path, title="Training data not found", exits=1) prints(train_path, title="Training data not found", exits=1)
if dev_path and not dev_path.exists(): if dev_path and not dev_path.exists():
prints(dev_path, title="Development data not found", exits=1) prints(dev_path, title="Development data not found", exits=1)
if meta_path is not None and not meta_path.exists():
prints(meta_path, title="meta.json not found", exits=1)
meta = util.read_json(meta_path) if meta_path else {}
if not isinstance(meta, dict):
prints("Expected dict but got: {}".format(type(meta)),
title="Not a valid meta.json format", exits=1)
lang_class = util.get_lang_class(lang) pipeline = ['tagger', 'parser', 'ner']
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
pipeline = ['token_vectors', 'tags', 'dependencies', 'entities'] if no_parser and 'parser' in pipeline: pipeline.remove('parser')
if no_tagger and 'tags' in pipeline: pipeline.remove('tags') if no_entities and 'ner' in pipeline: pipeline.remove('ner')
if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies')
if no_entities and 'entities' in pipeline: pipeline.remove('entities')
# Take dropout and batch size as generators of values -- dropout # Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore. # starts high and decays sharply, to force the optimizer to explore.
@ -68,33 +82,30 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
util.env_opt('dropout_to', 0.2), util.env_opt('dropout_to', 0.2),
util.env_opt('dropout_decay', 0.0)) util.env_opt('dropout_decay', 0.0))
batch_sizes = util.compounding(util.env_opt('batch_from', 1), batch_sizes = util.compounding(util.env_opt('batch_from', 1),
util.env_opt('batch_to', 64), util.env_opt('batch_to', 16),
util.env_opt('batch_compound', 1.001)) util.env_opt('batch_compound', 1.001))
if resume:
prints(output_path / 'model9.pickle', title="Resuming training")
nlp = dill.load((output_path / 'model9.pickle').open('rb'))
else:
nlp = lang_class(pipeline=pipeline)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents) corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train() n_train_words = corpus.count_train()
lang_class = util.get_lang_class(lang)
nlp = lang_class(pipeline=pipeline)
if vectors:
util.load_model(vectors, vocab=nlp.vocab)
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
nlp._optimizer = None
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
try: try:
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
gold_preproc=gold_preproc, max_length=0)
train_docs = list(train_docs)
for i in range(n_iter): for i in range(n_iter):
if resume:
i += 20
with tqdm.tqdm(total=n_train_words, leave=False) as pbar: with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
gold_preproc=gold_preproc, max_length=0)
losses = {} losses = {}
for batch in minibatch(train_docs, size=batch_sizes): for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch) docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer, nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses, drop=next(dropout_rates), losses=losses)
update_shared=True)
pbar.update(sum(len(doc) for doc in docs)) pbar.update(sum(len(doc) for doc in docs))
with nlp.use_params(optimizer.averages): with nlp.use_params(optimizer.averages):
@ -104,12 +115,22 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
nlp_loaded = lang_class(pipeline=pipeline) nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path) nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
scorer = nlp_loaded.evaluate( scorer = nlp_loaded.evaluate(
corpus.dev_docs( list(corpus.dev_docs(
nlp_loaded, nlp_loaded,
gold_preproc=gold_preproc)) gold_preproc=gold_preproc)))
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json') acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_: with acc_loc.open('w') as file_:
file_.write(json_dumps(scorer.scores)) file_.write(json_dumps(scorer.scores))
meta_loc = output_path / ('model%d' % i) / 'meta.json'
meta['accuracy'] = scorer.scores
meta['lang'] = nlp.lang
meta['pipeline'] = pipeline
meta['spacy_version'] = '>=%s' % about.__version__
meta.setdefault('name', 'model%d' % i)
meta.setdefault('version', version)
with meta_loc.open('w') as file_:
file_.write(json_dumps(meta))
util.set_env_log(True) util.set_env_log(True)
print_progress(i, losses, scorer.scores) print_progress(i, losses, scorer.scores)
finally: finally:
@ -138,12 +159,14 @@ def print_progress(itn, losses, dev_scores, wps=0.0):
'ents_p', 'ents_r', 'ents_f', 'wps']: 'ents_p', 'ents_r', 'ents_f', 'wps']:
scores[col] = 0.0 scores[col] = 0.0
scores['dep_loss'] = losses.get('parser', 0.0) scores['dep_loss'] = losses.get('parser', 0.0)
scores['ner_loss'] = losses.get('ner', 0.0)
scores['tag_loss'] = losses.get('tagger', 0.0) scores['tag_loss'] = losses.get('tagger', 0.0)
scores.update(dev_scores) scores.update(dev_scores)
scores['wps'] = wps scores['wps'] = wps
tpl = '\t'.join(( tpl = '\t'.join((
'{:d}', '{:d}',
'{dep_loss:.3f}', '{dep_loss:.3f}',
'{ner_loss:.3f}',
'{uas:.3f}', '{uas:.3f}',
'{ents_p:.3f}', '{ents_p:.3f}',
'{ents_r:.3f}', '{ents_r:.3f}',

View File

@ -7,6 +7,7 @@ import re
import ujson import ujson
import random import random
import cytoolz import cytoolz
import itertools
from .syntax import nonproj from .syntax import nonproj
from .util import ensure_path from .util import ensure_path
@ -146,9 +147,13 @@ def minibatch(items, size=8):
'''Iterate over batches of items. `size` may be an iterator, '''Iterate over batches of items. `size` may be an iterator,
so that batch-size can vary on each step. so that batch-size can vary on each step.
''' '''
# A plain int means "use this batch size on every step"; anything else is
# used as-is and is expected to yield one batch size per step via next().
if isinstance(size, int):
    # Repeat the size the caller asked for. This was hard-coded to 8, which
    # silently ignored any other integer passed as `size`.
    size_ = itertools.repeat(size)
else:
    size_ = size
items = iter(items) items = iter(items)
while True: while True:
batch_size = next(size) #if hasattr(size, '__next__') else size batch_size = next(size_)
batch = list(cytoolz.take(int(batch_size), items)) batch = list(cytoolz.take(int(batch_size), items))
if len(batch) == 0: if len(batch) == 0:
break break

View File

@ -29,9 +29,9 @@ _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm
'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb ' 'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
'TB T G M K %') 'TB T G M K %')
_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$' _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
_punct = r', : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &' _punct = r'…… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & ·'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » «' _quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'
_hyphens = '- — -- ---' _hyphens = '- — -- --- —— ~'
_other_symbols = r'[\p{So}]' _other_symbols = r'[\p{So}]'
UNITS = merge_chars(_units) UNITS = merge_chars(_units)

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .tag_map import TAG_MAP from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP from .lemmatizer import LOOKUP
@ -23,6 +24,7 @@ class GermanDefaults(Language.Defaults):
NORM_EXCEPTIONS, BASE_NORMS) NORM_EXCEPTIONS, BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = tuple(TOKENIZER_INFIXES)
tag_map = dict(TAG_MAP) tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS) stop_words = set(STOP_WORDS)
syntax_iterators = dict(SYNTAX_ITERATORS) syntax_iterators = dict(SYNTAX_ITERATORS)

View File

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
# QUOTES with every "'" character removed; the result is interpolated into the
# quote/bracket infix pattern below.
# NOTE(review): presumably so an apostrophe between letters is not split off
# as an infix -- confirm.
_quotes = QUOTES.replace("'", '')
# Infix patterns: the shared ellipsis and icon lists followed by the patterns
# below. Each pattern uses a lookbehind/lookahead pair so that only the
# character(s) between the two contexts are matched.
_infixes = (LIST_ELLIPSES + LIST_ICONS +
# "." between an ALPHA_LOWER character and an ALPHA_UPPER character.
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
# ",", "!" or "?" between two ALPHA characters.
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
# ":", "<", ">" or "=" preceded by an ALPHA character or '"' and followed by
# an ALPHA character.
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
# "," between two ALPHA characters.
# NOTE(review): already covered by the [,!?] pattern above -- looks redundant.
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
# A quote character (from _quotes) or one of ")", "]", "(", "[" between two
# ALPHA characters.
# NOTE(review): the lookahead is written "[\{a}]", so after formatting a
# backslash precedes the first character of ALPHA -- confirm this is intended.
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
# "--" between two ALPHA characters.
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
# "-" between two ASCII digits.
r'(?<=[0-9])-(?=[0-9])'])
# Public name under which the infix patterns are exported.
TOKENIZER_INFIXES = _infixes

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import LOOKUP from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
@ -17,6 +18,7 @@ from ...util import update_exc, add_lookups
class FrenchDefaults(Language.Defaults): class FrenchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: 'fr' lex_attr_getters[LANG] = lambda text: 'fr'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

View File

@ -0,0 +1,41 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
# Cardinal number words recognised by `like_num`. Matching is exact and
# case-sensitive, so every spelling that should count must be listed here.
# "zero" is kept for backward compatibility next to the accented "zéro".
_num_words = set("""
zero zéro un deux trois quatre cinq six sept huit neuf dix
onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
vingt trente quarante cinquante soixante septante quatre-vingt huitante nonante
cent mille mil million milliard billion quadrillion quintillion
sextillion septillion octillion nonillion decillion
""".split())


# Ordinal number words. NOTE: `like_num` below does not consult this set.
_ordinal_words = set("""
premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième
onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième
vingtième trentième quarantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième
centième millième millionnième milliardième billionnième quadrillionnième quintillionnième
sextillionnième septillionnième octillionnième nonillionnième decillionnième
""".split())


def like_num(text):
    """Return True if `text` looks like a number.

    A token counts as number-like when, after removing every ',' and '.',
    it is either made up only of digits, a single 'digits/digits' fraction,
    or one of the words in `_num_words`.

    text (unicode): The token text to check.
    RETURNS (bool): Whether the text resembles a number.
    """
    # Might require more work?
    # See this discussion: https://github.com/explosion/spaCy/pull/1161
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    # Exactly one slash: accept only when both sides are digits.
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    return text in _num_words
# Lexical attribute getters exported by this module, keyed by attribute ID.
LEX_ATTRS = {LIKE_NUM: like_num}

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS from ..norm_exceptions import BASE_NORMS
@ -12,6 +13,7 @@ from ...util import update_exc, add_lookups
class DutchDefaults(Language.Defaults): class DutchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: 'nl' lex_attr_getters[LANG] = lambda text: 'nl'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

View File

@ -0,0 +1,40 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
# Cardinal number words recognised by `like_num`. Matching is exact and
# case-sensitive. The teens 15-19 were previously missing even though
# 13, 14 and 20 were listed.
_num_words = set("""
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
veertien vijftien zestien zeventien achttien negentien twintig dertig veertig
vijftig zestig zeventig tachtig negentig honderd
duizend miljoen miljard biljoen biljard triljoen triljard
""".split())


# Ordinal number words. NOTE: `like_num` below does not consult this set.
_ordinal_words = set("""
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
twaalfde dertiende veertiende vijftiende zestiende zeventiende achttiende
negentiende twintigste dertigste veertigste vijftigste
zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste
miljardste biljoenste biljardste triljoenste triljardste
""".split())


def like_num(text):
    """Return True if `text` looks like a number.

    A token counts as number-like when, after removing every ',' and '.',
    it is either made up only of digits, a single 'digits/digits' fraction,
    or one of the words in `_num_words`.

    text (unicode): The token text to check.
    RETURNS (bool): Whether the text resembles a number.
    """
    # This only does the most basic check for whether a token is a digit
    # or matches one of the number words. In order to handle numbers like
    # "drieëntwintig", more work is required.
    # See this discussion: https://github.com/explosion/spaCy/pull/1177
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    # Exactly one slash: accept only when both sides are digits.
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    return text in _num_words
# Lexical attribute getters exported by this module, keyed by attribute ID.
LEX_ATTRS = {LIKE_NUM: like_num}

35
spacy/lang/th/__init__.py Normal file
View File

@ -0,0 +1,35 @@
# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...tokens import Doc
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class ThaiDefaults(Language.Defaults):
    """Default language data for Thai.

    Starts from a copy of the base lexical attribute getters, sets the LANG
    getter to return 'th', and attaches the Thai tokenizer exceptions, tag
    map and stop words.
    """
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'th'
    # Copy the module-level table, as is done for tag_map and stop_words
    # below, so changes made through the defaults do not leak back into the
    # imported TOKENIZER_EXCEPTIONS dict.
    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
    tag_map = dict(TAG_MAP)
    stop_words = set(STOP_WORDS)
class Thai(Language):
    """Thai language class; word segmentation is delegated to PyThaiNLP."""
    lang = 'th'
    Defaults = ThaiDefaults

    def make_doc(self, text):
        """Build a `Doc` from raw text using PyThaiNLP's `word_tokenize`.

        text (unicode): The text to tokenize.
        RETURNS (Doc): A document whose tokens are the segments returned by
            `word_tokenize(text, "newmm")`, with no trailing whitespace
            recorded for any token.
        RAISES (ImportError): If the `pythainlp` package cannot be imported.
        """
        # Imported lazily so that the dependency is only needed when Thai
        # text is actually tokenized.
        try:
            from pythainlp.tokenize import word_tokenize
        except ImportError:
            raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
                              "https://github.com/wannaphongcom/pythainlp/")
        # A single list() is enough; the extra comprehension only made a
        # second identical copy.
        words = list(word_tokenize(text, "newmm"))
        return Doc(self.vocab, words=words, spaces=[False]*len(words))
__all__ = ['Thai']

View File

@ -0,0 +1,62 @@
# encoding: utf8
from __future__ import unicode_literals
# data from https://github.com/wannaphongcom/pythainlp/blob/dev/pythainlp/corpus/stopwords-th.txt
# stop words as whitespace-separated list
STOP_WORDS = set("""
นอกจาก าให ทาง งน วง จาก จะ ความ คร คง ของ
ขอ ระหวาง รวม มาก มา พรอม พบ าน ผล บาง เปดเผย เป เนองจาก เดยวก เดยว เช เฉพาะ เข
อง างๆ าง ตาม งแต าน วย อาจ ออก อยาง อะไร อย อยาก หาก หลาย หลงจาก แต เอง เห
เลย เร เรา เม เพ เพราะ เปนการ เป หล หร หน วน าหร ลง วม ราย ขณะ อน การ
กว กลาว ไว ไป ได ให ใน โดย แห แล และ แรก แบบ เขา เคย ไม อยาก เก เกนๆ เกยวก เกยวก
เกยวของ เกยวเนอง เกยวๆ เกอบ เกอบจะ เกอบๆ แก แก แกไข ใกล ใกล ไกล ไกลๆ ขณะเดยวก ขณะใด ขณะใดๆ ขณะท ขณะน ขณะน ขณะหน ขวาง
ขวางๆ ใคร ใคร ใครจะ ใครๆ าย ายๆ ไง จง จด จน จนกระท จนกว จนขณะน จนตลอด จนถ จนท จนบดน จนเม จนแม จนแม
จรด จรดก จร จรงจ จรงๆ จรงๆจงๆ จวน จวนจะ จวนเจยน จวบ งก งก งก งกนและก งไดแก งๆ วย วยก วยเชนก วยท วยประการฉะน
วยเพราะ วยว วยเหต วยเหต วยเหต วยเหตเพราะ วยเหต วยเหมอนก งกลาว งก งก งกบว งกบว งเก
งเก งเคย ใดๆ ได ไดแก ไดแต ได ไดมา ได ตน ตนเอง ตนฯ ตรง ตรงๆ ตลอด ตลอดกาล ตลอดกาลนาน ตลอดจน ตลอดถ ตลอดท
ตลอดท ตลอดทวถ ตลอดทวท ตลอดป ตลอดไป ตลอดมา ตลอดระยะเวลา ตลอดว ตลอดเวลา ตลอดศก อก งแก งจะ งบดน งบดน
งเม งเมอใด งเมอไร งแม งแมจะ งแม งอยางไร อว กตอง กๆ เถอะ เถ ทรง ทว งคน งต งท งท งน งนนดวย งนนเพราะ
นอก นอกจากท นอกจากน นอกจากน นอกจากว นอกน นอกเหน นอกเหนอจาก อย อยกว อยๆ นะ กๆ นไง นเป นแหละ
นเอง นๆ บจากน บจากน บตงแต บแต บแต บแต เปนต เปนตนไป เปนตนมา เปนแต เปนแตเพยง เปนท เปนท เปนท เปนเพราะ
เปนเพราะว เปนเพยง เปนเพยงว เปนเพ เปนอ เปนอนมาก เปนอนว เปนอนๆ เปนอาท เปนๆ เปลยน เปลยนแปลง เป เปดเผย ไป าน านๆ
ดๆ เพยงเพ เพยงไร เพยงไหน เพอท เพอทจะ เพอว เพอให ภาค ภาคฯ ภาย ภายใต ภายนอก ภายใน ภายภาค ภายภาคหน ภายหน ภายหล
มอง มองว กจะ นๆ ยนะ ยน ยเน ยล นนาน นยง นย นยาว เยอะ เยอะแยะ เยอะๆ แยะ แยะๆ รวด รวดเร วม รวมก วมก
รวมดวย วมดวย รวมถ รวมท วมม รวมๆ ระยะ ระยะๆ ระหวาง บรอง อว นกาลนาน บเนอง ดๆ งกว งส งส งๆ เสมอนก
เสมอนว เสร เสรจก เสรจแล เสรจสมบรณ เสรจส เส เสยกอน เสยจน เสยจนกระท เสยจนถ เสยดวย เสยน เสยนนเอง เสยน เสยนกระไร เสยย
เสยยงน เสยแล ใหญ ให ใหแด ใหไป ใหม ใหมา ใหม ไหน ไหนๆ อด อน อยาง อยางเช อยางด อยางเดยว อยางใด อยางท อยางนอย อยางน
อยางน อยางโน แค จะ ได อเม ตาม ตามแต ตามท แลวแต กระท กระทำ กระน กระผม กล กลาวค กล กลมกอน
กลมๆ กวาง กวางขวาง กวางๆ อนหน อนหนาน อนๆ นดกว นดไหม นเถอะ นนะ นและก นไหม นเอง กำล กำลงจะ กำหนด เก
เก เกยวของ แก แกไข ใกล ใกล าง างเคยง างต างบน างลาง างๆ ขาด าพเจ าฯ เขาใจ เขยน คงจะ คงอย ครบ ครบคร ครบถวน
ครงกระน ครงกอน ครงครา ครงคราว ครงใด ครงท ครงน ครงน ครงละ ครงหน ครงหล ครงหลงส ครงไหน ครงๆ คร คร ครา คราใด คราท คราน คราน คราหน
คราไหน คราว คราวกอน คราวใด คราวท คราวน คราวน คราวโน คราวละ คราวหน คราวหน คราวหล คราวไหน คราวๆ คลาย คลายก คลายกนก
คลายก คลายกบว คลายว ควร อน อนขาง อนขางจะ อยไปทาง อนมาทาง อย อยๆ คะ คำ ดว ณๆ
เคยๆ แค แคจะ แค แค แคเพยง แค แคไหน ใคร ใครจะ าย ายๆ จนกว จนแม จนแม งๆ จวบก จวบจน จะได ดการ ดงาน ดแจง
ดต ดทำ ดหา ดให จากน จากน จากนไป จำ จำเป จำพวก งจะ งเป ฉะน ฉะน เฉกเช เฉย เฉยๆ ไฉน วงกอน
วงตอไป วงถดไป วงทาย วงท วงน วงน วงระหวาง วงแรก วงหน วงหล วงๆ วย านาน ชาว าๆ เชนกอน เชนก เชนเคย
เชนด เชนดงกอน เชนดงเก เชนดงท เชนดงว เชนเดยวก เชนเดยวก เชนใด เชนท เชนทเคย เชนท เชนน เชนนนเอง เชนน เชนเม เชนไร เช
เชอถ เชอม เชอว ใช ใชไหม ใช ซะ ซะกอน ซะจน ซะจนกระท ซะจนถ งไดแก วยก วยเชนก วยท วยเพราะ วยว วยเหต วยเหต
วยเหต วยเหตเพราะ วยเหต วยเหมอนก งกลาว งกบว งกบว งเก งเก งเคย างก างหาก ตามดวย ตามแต ตามท
ตามๆ เตมไปดวย เตมไปหมด เตมๆ แต แตอน แตจะ แตเด แตอง แต แตทว แต แต แตเพยง แตเม แตไร แตละ แต แตไหน แตอยางใด โต
โตๆ ใต าจะ าหาก งแก งแม งแมจะ งแม งอยางไร อว กตอง ทว งนนดวย งปวง งเป งมวล งส งหมด งหลาย งๆ
นใดน นท นทนใด ทำไม ทำไร ทำให ทำๆ จร เดยว ใด ใด ได เถอะ แท แทจร ไร ละ ละ
แล แหงน ไหน กคน กคร กครา กคราว กช กต กทาง กท กท กเม กว กวนน กส กหน กแห กอยาง
กอ กๆ เท เทาก เทาก เทาใด เทาท เทาน เทาน เทาไร เทาไหร แท แทจร เธอ นอกจากว อย อยกว อยๆ นไว บแต นาง
นางสาว าจะ นาน นานๆ นาย นำ นำพา นำมา ดหนอย ดๆ ไง นา แน แหละ แหล เอง เอง เน เน
เนยเอง ในชวง ในท ในเม ในระหวาง บน บอก บอกแล บอกว อย อยกว อยคร อยๆ ดดล ดเดยวน ดน ดน าง บางกว
บางขณะ บางคร บางครา บางคราว บางท บางท บางแห บางๆ ปฏ ประกอบ ประการ ประการฉะน ประการใด ประการหน ประมาณ ประสบ ปร
ปรากฏ ปรากฏว จจ เปนดวย เปนด เปนต เปนแต เปนเพ เปนอ เปนอนมาก เปนอาท านๆ ใด เผ เผอจะ เผอท เผอว าย
ายใด พบว พยายาม พรอมก พรอมก พรอมดวย พรอมท พรอมท พรอมเพยง พวก พวกก พวกก พวกแก พวกเขา พวกค พวกฉ พวกทาน
พวกท พวกเธอ พวกน พวกน พวกน พวกโน พวกม พวกม พอ พอก พอควร พอจะ พอด พอต พอท พอท พอเพยง พอแล พอสม พอสมควร
พอเหมาะ พอๆ พา นๆ เพราะฉะน เพราะว เพ เพงจะ เพ เพมเต เพยง เพยงแค เพยงใด เพยงแต เพยงพอ เพยงเพราะ
เพอว เพอให ภายใต มองว มากกว มากมาย ฉะน ใช ได แต งเน งหมาย เมอกอน เมอคร เมอครงกอน
เมอคราวกอน เมอคราวท เมอคราว เมอค เมอเช เมอใด เมอน เมอน เมอเย เมอไร เมอวนวาน เมอวาน เมอไหร แม แมกระท แมแต แมนว แม
ไมอย ไมอยจะ ไมอยเป ไมใช ไมเปนไร ไม ยก ยกให ยอม ยอมร อม อย งคง งง งง งโง งไง งจะ งแต ยาก
ยาว ยาวนาน งกว งข งขนไป งจน งจะ งน งเม งแล งใหญ วมก รวมดวย วมดวย อว เร เรวๆ เราๆ เรยก เรยบ เรอย
เรอยๆ ไร วน วนจน วนแต ละ าส เล เลกนอย เลกๆ เลาว แลวก แลวแต แลวเสร นใด นน นน นไหน สบาย สม สมยกอน
สมยน สมยน สมยโน วนเก วนดอย วนด วนใด วนท วนนอย วนน วนมาก วนใหญ นๆ สามารถ สำค
งใด งน งน งไหน เสรจแล เสยดวย เสยแล แสดง แสดงว หน หนอ หนอย หนอย หมด หมดก หมดส หรอไง หรอเปล หรอไม หรอย
หรอไร หากแม หากแม หากแมนว หากว หาความ หาใช หาร เหต เหตผล เหต เหต เหตไร เหนแก เหนควร เหนจะ เหนว เหล เหลอเก เหล
เหลาน เหลาน แหงใด แหงน แหงน แหงโน แหงไหน แหละ ใหแก ใหญ ใหญโต อยางเช อยางด อยางเดยว อยางใด อยางท อยางนอย อยางน อยางน
อยางโน อยางมาก อยางย อยางไร อยางไรก อยางไรกได อยางไรเส อยางละ อยางหน อยางไหน อยางๆ นจะ นใด นไดแก นท
นทจร นทจะ นเนองมาจาก นละ นไหน นๆ อาจจะ อาจเป อาจเปนดวย นๆ เอ เอา ฯล ฯลฯ
""".split())

81
spacy/lang/th/tag_map.py Normal file
View File

@ -0,0 +1,81 @@
# encoding: utf8
# data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1)
from __future__ import unicode_literals
from ...symbols import *
# Maps each tag (a coarse POS name or a fine-grained corpus tag) to the
# attribute dict assigned to tokens carrying that tag.
TAG_MAP = {
    # NOUN
    "NOUN": {POS: NOUN},
    "NCMN": {POS: NOUN},
    "NTTL": {POS: NOUN},
    "CNIT": {POS: NOUN},
    "CLTV": {POS: NOUN},
    "CMTR": {POS: NOUN},
    "CFQC": {POS: NOUN},
    "CVBL": {POS: NOUN},
    # PRON
    "PRON": {POS: PRON},
    "NPRP": {POS: PRON},
    # ADJ
    "ADJ": {POS: ADJ},
    "NONM": {POS: ADJ},
    "VATT": {POS: ADJ},
    "DONM": {POS: ADJ},
    # ADV
    "ADV": {POS: ADV},
    "ADVN": {POS: ADV},
    "ADVI": {POS: ADV},
    "ADVP": {POS: ADV},
    "ADVS": {POS: ADV},
    # INTJ
    "INT": {POS: INTJ},
    # PROPN
    "PROPN": {POS: PROPN},
    "PPRS": {POS: PROPN},
    "PDMN": {POS: PROPN},
    "PNTR": {POS: PROPN},
    # DET
    "DET": {POS: DET},
    "DDAN": {POS: DET},
    "DDAC": {POS: DET},
    "DDBQ": {POS: DET},
    "DDAQ": {POS: DET},
    "DIAC": {POS: DET},
    "DIBQ": {POS: DET},
    "DIAQ": {POS: DET},
    # NUM
    "NUM": {POS: NUM},
    "NCNM": {POS: NUM},
    "NLBL": {POS: NUM},
    # NOTE(review): "DCNM" used to be listed twice, first under DET and then
    # here. The later entry won at runtime, so the NUM mapping is the one
    # kept -- confirm which POS is actually intended.
    "DCNM": {POS: NUM},
    # AUX
    "AUX": {POS: AUX},
    "XVBM": {POS: AUX},
    "XVAM": {POS: AUX},
    "XVMM": {POS: AUX},
    "XVBB": {POS: AUX},
    "XVAE": {POS: AUX},
    # ADP
    "ADP": {POS: ADP},
    "RPRE": {POS: ADP},
    # CCONJ
    "CCONJ": {POS: CCONJ},
    "JCRG": {POS: CCONJ},
    # SCONJ
    "SCONJ": {POS: SCONJ},
    "PREL": {POS: SCONJ},
    "JSBR": {POS: SCONJ},
    "JCMP": {POS: SCONJ},
    # PART
    "PART": {POS: PART},
    "FIXN": {POS: PART},
    "FIXV": {POS: PART},
    "EAFF": {POS: PART},
    "AITT": {POS: PART},
    "NEG": {POS: PART},
    # PUNCT
    "PUNCT": {POS: PUNCT},
    "PUNC": {POS: PUNCT},
}

View File

@ -0,0 +1,43 @@
# encoding: utf8
from __future__ import unicode_literals
from ...symbols import *
# (abbreviation, lemma) pairs for the Thai month-name abbreviations.
_MONTH_ABBREVIATIONS = (
    ("ม.ค.", "มกราคม"),
    ("ก.พ.", "กุมภาพันธ์"),
    ("มี.ค.", "มีนาคม"),
    ("เม.ย.", "เมษายน"),
    ("พ.ค.", "พฤษภาคม"),
    ("มิ.ย.", "มิถุนายน"),
    ("ก.ค.", "กรกฎาคม"),
    ("ส.ค.", "สิงหาคม"),
    ("ก.ย.", "กันยายน"),
    ("ต.ค.", "ตุลาคม"),
    ("พ.ย.", "พฤศจิกายน"),
    ("ธ.ค.", "ธันวาคม"),
)

# Each abbreviation maps to a one-token analysis: the token text is the
# abbreviation itself and its lemma is the paired full form.
TOKENIZER_EXCEPTIONS = {
    abbr: [{ORTH: abbr, LEMMA: full_form}]
    for abbr, full_form in _MONTH_ABBREVIATIONS
}

View File

@ -14,8 +14,8 @@ class Chinese(Language):
except ImportError: except ImportError:
raise ImportError("The Chinese tokenizer requires the Jieba library: " raise ImportError("The Chinese tokenizer requires the Jieba library: "
"https://github.com/fxsjy/jieba") "https://github.com/fxsjy/jieba")
words = list(jieba.cut(text, cut_all=True)) words = list(jieba.cut(text, cut_all=False))
words=[x for x in words if x] words = [x for x in words if x]
return Doc(self.vocab, words=words, spaces=[False]*len(words)) return Doc(self.vocab, words=words, spaces=[False]*len(words))

View File

@ -34,6 +34,7 @@ from .lang.tag_map import TAG_MAP
from .lang.lex_attrs import LEX_ATTRS from .lang.lex_attrs import LEX_ATTRS
from . import util from . import util
from .scorer import Scorer from .scorer import Scorer
from ._ml import link_vectors_to_models
class BaseDefaults(object): class BaseDefaults(object):
@ -278,8 +279,7 @@ class Language(object):
def make_doc(self, text): def make_doc(self, text):
return self.tokenizer(text) return self.tokenizer(text)
def update(self, docs, golds, drop=0., sgd=None, losses=None, def update(self, docs, golds, drop=0., sgd=None, losses=None):
update_shared=False):
"""Update the models in the pipeline. """Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects. docs (iterable): A batch of `Doc` objects.
@ -303,32 +303,17 @@ class Language(object):
if self._optimizer is None: if self._optimizer is None:
self._optimizer = Adam(Model.ops, 0.001) self._optimizer = Adam(Model.ops, 0.001)
sgd = self._optimizer sgd = self._optimizer
tok2vec = self.pipeline[0]
feats = tok2vec.doc2feats(docs)
grads = {} grads = {}
def get_grads(W, dW, key=None): def get_grads(W, dW, key=None):
grads[key] = (W, dW) grads[key] = (W, dW)
pipes = list(self.pipeline[1:]) pipes = list(self.pipeline)
random.shuffle(pipes) random.shuffle(pipes)
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
for proc in pipes: for proc in pipes:
if not hasattr(proc, 'update'): if not hasattr(proc, 'update'):
continue continue
d_tokvecses = proc.update((docs, tokvecses), golds, proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
drop=drop, sgd=get_grads, losses=losses)
if update_shared and d_tokvecses is not None:
for i, d_tv in enumerate(d_tokvecses):
all_d_tokvecses[i] += d_tv
if update_shared and bp_tokvecses is not None:
bp_tokvecses(all_d_tokvecses, sgd=sgd)
for key, (W, dW) in grads.items(): for key, (W, dW) in grads.items():
sgd(W, dW, key=key) sgd(W, dW, key=key)
# Clear the tensor variable, to free GPU memory.
# If we don't do this, the memory leak gets pretty
# bad, because we may be holding part of a batch.
for doc in docs:
doc.tensor = None
def preprocess_gold(self, docs_golds): def preprocess_gold(self, docs_golds):
"""Can be called before training to pre-process gold data. By default, """Can be called before training to pre-process gold data. By default,
@ -343,36 +328,49 @@ class Language(object):
for doc, gold in docs_golds: for doc, gold in docs_golds:
yield doc, gold yield doc, gold
def begin_training(self, get_gold_tuples, **cfg): def resume_training(self, **cfg):
if cfg.get('device', -1) >= 0:
device = util.use_gpu(cfg['device'])
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = Model.ops.asarray(
self.vocab.vectors.data)
else:
device = None
learn_rate = util.env_opt('learn_rate', 0.001)
beta1 = util.env_opt('optimizer_B1', 0.9)
beta2 = util.env_opt('optimizer_B2', 0.999)
eps = util.env_opt('optimizer_eps', 1e-08)
L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
self._optimizer.max_grad_norm = max_grad_norm
self._optimizer.device = device
return self._optimizer
def begin_training(self, get_gold_tuples=None, **cfg):
"""Allocate models, pre-process training data and acquire a trainer and """Allocate models, pre-process training data and acquire a trainer and
optimizer. Used as a contextmanager. optimizer. Used as a contextmanager.
gold_tuples (iterable): Gold-standard training data. get_gold_tuples (function): Function returning gold data
**cfg: Config parameters. **cfg: Config parameters.
YIELDS (tuple): A trainer and an optimizer. returns: An optimizer
EXAMPLE:
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
>>> for epoch in trainer.epochs(gold):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
""" """
if self.parser:
self.pipeline.append(NeuralLabeller(self.vocab))
# Populate vocab # Populate vocab
for _, annots_brackets in get_gold_tuples(): if get_gold_tuples is not None:
for annots, _ in annots_brackets: for _, annots_brackets in get_gold_tuples():
for word in annots[1]: for annots, _ in annots_brackets:
_ = self.vocab[word] for word in annots[1]:
_ = self.vocab[word]
contexts = [] contexts = []
if cfg.get('device', -1) >= 0: if cfg.get('device', -1) >= 0:
import cupy.cuda.device device = util.use_gpu(cfg['device'])
device = cupy.cuda.device.Device(cfg['device']) if self.vocab.vectors.data.shape[1] >= 1:
device.use() self.vocab.vectors.data = Model.ops.asarray(
Model.ops = CupyOps() self.vocab.vectors.data)
Model.Ops = CupyOps
else: else:
device = None device = None
link_vectors_to_models(self.vocab)
for proc in self.pipeline: for proc in self.pipeline:
if hasattr(proc, 'begin_training'): if hasattr(proc, 'begin_training'):
context = proc.begin_training(get_gold_tuples(), context = proc.begin_training(get_gold_tuples(),
@ -390,7 +388,7 @@ class Language(object):
self._optimizer.device = device self._optimizer.device = device
return self._optimizer return self._optimizer
def evaluate(self, docs_golds): def evaluate(self, docs_golds, verbose=False):
scorer = Scorer() scorer = Scorer()
docs, golds = zip(*docs_golds) docs, golds = zip(*docs_golds)
docs = list(docs) docs = list(docs)
@ -403,8 +401,9 @@ class Language(object):
docs = list(pipe.pipe(docs)) docs = list(pipe.pipe(docs))
assert len(docs) == len(golds) assert len(docs) == len(golds)
for doc, gold in zip(docs, golds): for doc, gold in zip(docs, golds):
scorer.score(doc, gold) if verbose:
doc.tensor = None print(doc)
scorer.score(doc, gold, verbose=verbose)
return scorer return scorer
@contextmanager @contextmanager
@ -493,7 +492,6 @@ class Language(object):
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
serializers = OrderedDict(( serializers = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)),
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
)) ))
@ -505,6 +503,7 @@ class Language(object):
if not hasattr(proc, 'to_disk'): if not hasattr(proc, 'to_disk'):
continue continue
serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
serializers['vocab'] = lambda p: self.vocab.to_disk(p)
util.to_disk(path, serializers, {p: False for p in disable}) util.to_disk(path, serializers, {p: False for p in disable})
def from_disk(self, path, disable=tuple()): def from_disk(self, path, disable=tuple()):

View File

@ -38,7 +38,8 @@ class Lemmatizer(object):
avoid lemmatization entirely. avoid lemmatization entirely.
""" """
morphology = {} if morphology is None else morphology morphology = {} if morphology is None else morphology
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] others = [key for key in morphology
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
true_morph_key = morphology.get('morph', 0) true_morph_key = morphology.get('morph', 0)
if univ_pos == 'noun' and morphology.get('Number') == 'sing': if univ_pos == 'noun' and morphology.get('Number') == 'sing':
return True return True
@ -47,7 +48,9 @@ class Lemmatizer(object):
# This maps 'VBP' to base form -- probably just need 'IS_BASE' # This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology # morphology
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \ elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
morphology.get('Tense') == 'pres'): morphology.get('Tense') == 'pres' and \
morphology.get('Number') is None and \
not others):
return True return True
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
return True return True

View File

@ -421,47 +421,69 @@ cdef class PhraseMatcher:
cdef int max_length cdef int max_length
cdef attr_t* _phrase_key cdef attr_t* _phrase_key
def __init__(self, Vocab vocab, phrases, max_length=10): cdef public object _callbacks
cdef public object _patterns
def __init__(self, Vocab vocab, max_length=10):
self.mem = Pool() self.mem = Pool()
self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t)) self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
self.max_length = max_length self.max_length = max_length
self.vocab = vocab self.vocab = vocab
self.matcher = Matcher(self.vocab, {}) self.matcher = Matcher(self.vocab)
self.phrase_ids = PreshMap() self.phrase_ids = PreshMap()
for phrase in phrases:
if len(phrase) < max_length:
self.add(phrase)
abstract_patterns = [] abstract_patterns = []
for length in range(1, max_length): for length in range(1, max_length):
abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
self.matcher.add('Candidate', 'MWE', {}, abstract_patterns, acceptor=self.accept_match) self.matcher.add('Candidate', None, *abstract_patterns)
self._callbacks = {}
def add(self, Doc tokens): def __len__(self):
cdef int length = tokens.length raise NotImplementedError
assert length < self.max_length
tags = get_bilou(length)
assert len(tags) == length, length
def __contains__(self, key):
raise NotImplementedError
def __reduce__(self):
return (self.__class__, (self.vocab,), None, None)
def add(self, key, on_match, *docs):
cdef Doc doc
for doc in docs:
if len(doc) >= self.max_length:
msg = (
"Pattern length (%d) >= phrase_matcher.max_length (%d). "
"Length can be set on initialization, up to 10."
)
raise ValueError(msg % (len(doc), self.max_length))
cdef hash_t ent_id = self.matcher._normalize_key(key)
self._callbacks[ent_id] = on_match
cdef int length
cdef int i cdef int i
for i in range(self.max_length): cdef hash_t phrase_hash
self._phrase_key[i] = 0 for doc in docs:
for i, tag in enumerate(tags): length = doc.length
lexeme = self.vocab[tokens.c[i].lex.orth] tags = get_bilou(length)
lexeme.set_flag(tag, True) for i in range(self.max_length):
self._phrase_key[i] = lexeme.orth self._phrase_key[i] = 0
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) for i, tag in enumerate(tags):
self.phrase_ids[key] = True lexeme = self.vocab[doc.c[i].lex.orth]
lexeme.set_flag(tag, True)
self._phrase_key[i] = lexeme.orth
phrase_hash = hash64(self._phrase_key,
self.max_length * sizeof(attr_t), 0)
self.phrase_ids.set(phrase_hash, <void*>ent_id)
def __call__(self, Doc doc): def __call__(self, Doc doc):
matches = [] matches = []
for ent_id, label, start, end in self.matcher(doc): for _, start, end in self.matcher(doc):
cand = doc[start : end] ent_id = self.accept_match(doc, start, end)
start = cand[0].idx if ent_id is not None:
end = cand[-1].idx + len(cand[-1]) matches.append((ent_id, start, end))
matches.append((start, end, cand.root.tag_, cand.text, 'MWE')) for i, (ent_id, start, end) in enumerate(matches):
for match in matches: on_match = self._callbacks.get(ent_id)
doc.merge(*match) if on_match is not None:
on_match(self, doc, i, matches)
return matches return matches
def pipe(self, stream, batch_size=1000, n_threads=2): def pipe(self, stream, batch_size=1000, n_threads=2):
@ -469,7 +491,7 @@ cdef class PhraseMatcher:
self(doc) self(doc)
yield doc yield doc
def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end): def accept_match(self, Doc doc, int start, int end):
assert (end - start) < self.max_length assert (end - start) < self.max_length
cdef int i, j cdef int i, j
for i in range(self.max_length): for i in range(self.max_length):
@ -477,7 +499,8 @@ cdef class PhraseMatcher:
for i, j in enumerate(range(start, end)): for i, j in enumerate(range(start, end)):
self._phrase_key[i] = doc.c[j].lex.orth self._phrase_key[i] = doc.c[j].lex.orth
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
if self.phrase_ids.get(key): ent_id = <hash_t>self.phrase_ids.get(key)
return (ent_id, label, start, end) if ent_id == 0:
return None
else: else:
return False return ent_id

View File

@ -146,6 +146,8 @@ cdef class Morphology:
self.add_special_case(tag_str, form_str, attrs) self.add_special_case(tag_str, form_str, attrs)
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
if orth not in self.strings:
return orth
cdef unicode py_string = self.strings[orth] cdef unicode py_string = self.strings[orth]
if self.lemmatizer is None: if self.lemmatizer is None:
return self.strings.add(py_string.lower()) return self.strings.add(py_string.lower())

View File

@ -4,7 +4,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from thinc.api import chain, layerize, with_getitem from thinc.api import chain, layerize, with_getitem
from thinc.neural import Model, Softmax
import numpy import numpy
cimport numpy as np cimport numpy as np
import cytoolz import cytoolz
@ -14,17 +13,18 @@ import ujson
import msgpack import msgpack
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
from thinc.neural._classes.hash_embed import HashEmbed from thinc.i2v import HashEmbed
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.misc import Residual
from thinc.misc import BatchNorm as BN
from thinc.misc import LayerNorm as LN
from thinc.neural.util import to_categorical from thinc.neural.util import to_categorical
from thinc.neural.pooling import Pooling, max_pool, mean_pool
from thinc.neural._classes.difference import Siamese, CauchySimilarity from thinc.neural._classes.difference import Siamese, CauchySimilarity
from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.resnet import Residual
from thinc.neural._classes.batchnorm import BatchNorm as BN
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
from .syntax.parser cimport Parser as LinearParser from .syntax.parser cimport Parser as LinearParser
from .syntax.nn_parser cimport Parser as NeuralParser from .syntax.nn_parser cimport Parser as NeuralParser
@ -41,13 +41,14 @@ from .syntax import nonproj
from .compat import json_dumps from .compat import json_dumps
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats from ._ml import rebatch, Tok2Vec, flatten
from ._ml import build_text_classifier, build_tagger_model from ._ml import build_text_classifier, build_tagger_model
from ._ml import link_vectors_to_models
from .parts_of_speech import X from .parts_of_speech import X
class SentenceSegmenter(object): class SentenceSegmenter(object):
'''A simple spaCy hook, to allow custom sentence boundary detection logic """A simple spaCy hook, to allow custom sentence boundary detection logic
(that doesn't require the dependency parse). (that doesn't require the dependency parse).
To change the sentence boundary detection strategy, pass a generator To change the sentence boundary detection strategy, pass a generator
@ -56,7 +57,7 @@ class SentenceSegmenter(object):
Sentence detection strategies should be generators that take `Doc` objects Sentence detection strategies should be generators that take `Doc` objects
and yield `Span` objects for each sentence. and yield `Span` objects for each sentence.
''' """
name = 'sbd' name = 'sbd'
def __init__(self, vocab, strategy=None): def __init__(self, vocab, strategy=None):
@ -88,17 +89,30 @@ class BaseThincComponent(object):
@classmethod @classmethod
def Model(cls, *shape, **kwargs): def Model(cls, *shape, **kwargs):
"""Initialize a model for the pipe."""
raise NotImplementedError raise NotImplementedError
def __init__(self, vocab, model=True, **cfg): def __init__(self, vocab, model=True, **cfg):
"""Create a new pipe instance."""
raise NotImplementedError raise NotImplementedError
def __call__(self, doc): def __call__(self, doc):
"""Apply the pipe to one document. The document is
modified in-place, and returned.
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
scores = self.predict([doc]) scores = self.predict([doc])
self.set_annotations([doc], scores) self.set_annotations([doc], scores)
return doc return doc
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128, n_threads=-1):
"""Apply the pipe to a stream of documents.
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
for docs in cytoolz.partition_all(batch_size, stream): for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs) docs = list(docs)
scores = self.predict(docs) scores = self.predict(docs)
@ -106,27 +120,43 @@ class BaseThincComponent(object):
yield from docs yield from docs
def predict(self, docs): def predict(self, docs):
"""Apply the pipeline's model to a batch of docs, without
modifying them.
"""
raise NotImplementedError raise NotImplementedError
def set_annotations(self, docs, scores): def set_annotations(self, docs, scores):
"""Modify a batch of documents, using pre-computed scores."""
raise NotImplementedError raise NotImplementedError
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None): def update(self, docs, golds, drop=0., sgd=None, losses=None):
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model.
Delegates to predict() and get_loss().
"""
raise NotImplementedError raise NotImplementedError
def get_loss(self, docs, golds, scores): def get_loss(self, docs, golds, scores):
"""Find the loss and gradient of loss for the batch of
documents and their predicted scores."""
raise NotImplementedError raise NotImplementedError
def begin_training(self, gold_tuples=tuple(), pipeline=None): def begin_training(self, gold_tuples=tuple(), pipeline=None):
token_vector_width = pipeline[0].model.nO """Initialize the pipe for training, using data examples if available.
If no model has been initialized yet, the model is added."""
if self.model is True: if self.model is True:
self.model = self.Model(1, token_vector_width) self.model = self.Model(**self.cfg)
link_vectors_to_models(self.vocab)
def use_params(self, params): def use_params(self, params):
"""Modify the pipe's model, to use the given parameter values.
"""
with self.model.use_params(params): with self.model.use_params(params):
yield yield
def to_bytes(self, **exclude): def to_bytes(self, **exclude):
"""Serialize the pipe to a bytestring."""
serialize = OrderedDict(( serialize = OrderedDict((
('cfg', lambda: json_dumps(self.cfg)), ('cfg', lambda: json_dumps(self.cfg)),
('model', lambda: self.model.to_bytes()), ('model', lambda: self.model.to_bytes()),
@ -135,37 +165,42 @@ class BaseThincComponent(object):
return util.to_bytes(serialize, exclude) return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, **exclude):
"""Load the pipe from a bytestring."""
def load_model(b): def load_model(b):
if self.model is True: if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(**self.cfg) self.model = self.Model(**self.cfg)
self.model.from_bytes(b) self.model.from_bytes(b)
deserialize = OrderedDict(( deserialize = OrderedDict((
('cfg', lambda b: self.cfg.update(ujson.loads(b))), ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
('vocab', lambda b: self.vocab.from_bytes(b)),
('model', load_model), ('model', load_model),
('vocab', lambda b: self.vocab.from_bytes(b))
)) ))
util.from_bytes(bytes_data, deserialize, exclude) util.from_bytes(bytes_data, deserialize, exclude)
return self return self
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
"""Serialize the pipe to disk."""
serialize = OrderedDict(( serialize = OrderedDict((
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))), ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
('vocab', lambda p: self.vocab.to_disk(p)),
('model', lambda p: p.open('wb').write(self.model.to_bytes())), ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('vocab', lambda p: self.vocab.to_disk(p))
)) ))
util.to_disk(path, serialize, exclude) util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude): def from_disk(self, path, **exclude):
"""Load the pipe from disk."""
def load_model(p): def load_model(p):
if self.model is True: if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(**self.cfg) self.model = self.Model(**self.cfg)
self.model.from_bytes(p.open('rb').read()) self.model.from_bytes(p.open('rb').read())
deserialize = OrderedDict(( deserialize = OrderedDict((
('cfg', lambda p: self.cfg.update(_load_cfg(p))), ('cfg', lambda p: self.cfg.update(_load_cfg(p))),
('model', load_model),
('vocab', lambda p: self.vocab.from_disk(p)), ('vocab', lambda p: self.vocab.from_disk(p)),
('model', load_model),
)) ))
util.from_disk(path, deserialize, exclude) util.from_disk(path, deserialize, exclude)
return self return self
@ -193,7 +228,7 @@ class TokenVectorEncoder(BaseThincComponent):
""" """
width = util.env_opt('token_vector_width', width) width = util.env_opt('token_vector_width', width)
embed_size = util.env_opt('embed_size', embed_size) embed_size = util.env_opt('embed_size', embed_size)
return Tok2Vec(width, embed_size, preprocess=None) return Tok2Vec(width, embed_size, **cfg)
def __init__(self, vocab, model=True, **cfg): def __init__(self, vocab, model=True, **cfg):
"""Construct a new statistical model. Weights are not allocated on """Construct a new statistical model. Weights are not allocated on
@ -210,9 +245,10 @@ class TokenVectorEncoder(BaseThincComponent):
>>> tok2vec.model = tok2vec.Model(128, 5000) >>> tok2vec.model = tok2vec.Model(128, 5000)
""" """
self.vocab = vocab self.vocab = vocab
self.doc2feats = doc2feats()
self.model = model self.model = model
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.cfg.setdefault('cnn_maxout_pieces', 3)
def __call__(self, doc): def __call__(self, doc):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
@ -245,8 +281,7 @@ class TokenVectorEncoder(BaseThincComponent):
docs (iterable): A sequence of `Doc` objects. docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the documents. RETURNS (object): Vector representations for each token in the documents.
""" """
feats = self.doc2feats(docs) tokvecs = self.model(docs)
tokvecs = self.model(feats)
return tokvecs return tokvecs
def set_annotations(self, docs, tokvecses): def set_annotations(self, docs, tokvecses):
@ -270,8 +305,7 @@ class TokenVectorEncoder(BaseThincComponent):
""" """
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
feats = self.doc2feats(docs) tokvecs, bp_tokvecs = self.model.begin_update(docs, drop=drop)
tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop)
return tokvecs, bp_tokvecs return tokvecs, bp_tokvecs
def get_loss(self, docs, golds, scores): def get_loss(self, docs, golds, scores):
@ -285,9 +319,10 @@ class TokenVectorEncoder(BaseThincComponent):
gold_tuples (iterable): Gold-standard training data. gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of. pipeline (list): The pipeline the model is part of.
""" """
self.doc2feats = doc2feats()
if self.model is True: if self.model is True:
self.model = self.Model() self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(**self.cfg)
link_vectors_to_models(self.vocab)
class NeuralTagger(BaseThincComponent): class NeuralTagger(BaseThincComponent):
@ -296,29 +331,29 @@ class NeuralTagger(BaseThincComponent):
self.vocab = vocab self.vocab = vocab
self.model = model self.model = model
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
def __call__(self, doc): def __call__(self, doc):
tags = self.predict(([doc], [doc.tensor])) tags = self.predict([doc])
self.set_annotations([doc], tags) self.set_annotations([doc], tags)
return doc return doc
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream): for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs) docs = list(docs)
tokvecs = [d.tensor for d in docs] tag_ids = self.predict(docs)
tag_ids = self.predict((docs, tokvecs))
self.set_annotations(docs, tag_ids) self.set_annotations(docs, tag_ids)
yield from docs yield from docs
def predict(self, docs_tokvecs): def predict(self, docs):
scores = self.model(docs_tokvecs) scores = self.model(docs)
scores = self.model.ops.flatten(scores) scores = self.model.ops.flatten(scores)
guesses = scores.argmax(axis=1) guesses = scores.argmax(axis=1)
if not isinstance(guesses, numpy.ndarray): if not isinstance(guesses, numpy.ndarray):
guesses = guesses.get() guesses = guesses.get()
tokvecs = docs_tokvecs[1]
guesses = self.model.ops.unflatten(guesses, guesses = self.model.ops.unflatten(guesses,
[tv.shape[0] for tv in tokvecs]) [len(d) for d in docs])
return guesses return guesses
def set_annotations(self, docs, batch_tag_ids): def set_annotations(self, docs, batch_tag_ids):
@ -338,20 +373,16 @@ class NeuralTagger(BaseThincComponent):
idx += 1 idx += 1
doc.is_tagged = True doc.is_tagged = True
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): def update(self, docs, golds, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses: if losses is not None and self.name not in losses:
losses[self.name] = 0. losses[self.name] = 0.
docs, tokvecs = docs_tokvecs
if self.model.nI is None: tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
self.model.nI = tokvecs[0].shape[1]
tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
bp_tag_scores(d_tag_scores, sgd=sgd)
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
if losses is not None: if losses is not None:
losses[self.name] += loss losses[self.name] += loss
return d_tokvecs
def get_loss(self, docs, golds, scores): def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores) scores = self.model.ops.flatten(scores)
@ -392,13 +423,14 @@ class NeuralTagger(BaseThincComponent):
vocab.morphology = Morphology(vocab.strings, new_tag_map, vocab.morphology = Morphology(vocab.strings, new_tag_map,
vocab.morphology.lemmatizer, vocab.morphology.lemmatizer,
exc=vocab.morphology.exc) exc=vocab.morphology.exc)
token_vector_width = pipeline[0].model.nO
if self.model is True: if self.model is True:
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
link_vectors_to_models(self.vocab)
@classmethod @classmethod
def Model(cls, n_tags, token_vector_width): def Model(cls, n_tags, **cfg):
return build_tagger_model(n_tags, token_vector_width) return build_tagger_model(n_tags, **cfg)
def use_params(self, params): def use_params(self, params):
with self.model.use_params(params): with self.model.use_params(params):
@ -419,7 +451,7 @@ class NeuralTagger(BaseThincComponent):
if self.model is True: if self.model is True:
token_vector_width = util.env_opt('token_vector_width', token_vector_width = util.env_opt('token_vector_width',
self.cfg.get('token_vector_width', 128)) self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
self.model.from_bytes(b) self.model.from_bytes(b)
def load_tag_map(b): def load_tag_map(b):
@ -438,6 +470,7 @@ class NeuralTagger(BaseThincComponent):
return self return self
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
serialize = OrderedDict(( serialize = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)), ('vocab', lambda p: self.vocab.to_disk(p)),
('tag_map', lambda p: p.open('wb').write(msgpack.dumps( ('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
@ -452,9 +485,7 @@ class NeuralTagger(BaseThincComponent):
def from_disk(self, path, **exclude): def from_disk(self, path, **exclude):
def load_model(p): def load_model(p):
if self.model is True: if self.model is True:
token_vector_width = util.env_opt('token_vector_width', self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(p.open('rb').read()) self.model.from_bytes(p.open('rb').read())
def load_tag_map(p): def load_tag_map(p):
@ -466,10 +497,10 @@ class NeuralTagger(BaseThincComponent):
exc=self.vocab.morphology.exc) exc=self.vocab.morphology.exc)
deserialize = OrderedDict(( deserialize = OrderedDict((
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
('vocab', lambda p: self.vocab.from_disk(p)), ('vocab', lambda p: self.vocab.from_disk(p)),
('tag_map', load_tag_map), ('tag_map', load_tag_map),
('model', load_model), ('model', load_model),
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
)) ))
util.from_disk(path, deserialize, exclude) util.from_disk(path, deserialize, exclude)
return self return self
@ -477,10 +508,28 @@ class NeuralTagger(BaseThincComponent):
class NeuralLabeller(NeuralTagger): class NeuralLabeller(NeuralTagger):
name = 'nn_labeller' name = 'nn_labeller'
def __init__(self, vocab, model=True, **cfg): def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
self.vocab = vocab self.vocab = vocab
self.model = model self.model = model
if target == 'dep':
self.make_label = self.make_dep
elif target == 'tag':
self.make_label = self.make_tag
elif target == 'ent':
self.make_label = self.make_ent
elif target == 'dep_tag_offset':
self.make_label = self.make_dep_tag_offset
elif target == 'ent_tag':
self.make_label = self.make_ent_tag
elif hasattr(target, '__call__'):
self.make_label = target
else:
raise ValueError(
"NeuralLabeller target should be function or one of "
"['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
@property @property
def labels(self): def labels(self):
@ -493,41 +542,79 @@ class NeuralLabeller(NeuralTagger):
def set_annotations(self, docs, dep_ids): def set_annotations(self, docs, dep_ids):
pass pass
def begin_training(self, gold_tuples=tuple(), pipeline=None): def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):
gold_tuples = nonproj.preprocess_training_data(gold_tuples) gold_tuples = nonproj.preprocess_training_data(gold_tuples)
for raw_text, annots_brackets in gold_tuples: for raw_text, annots_brackets in gold_tuples:
for annots, brackets in annots_brackets: for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots ids, words, tags, heads, deps, ents = annots
for dep in deps: for i in range(len(ids)):
if dep not in self.labels: label = self.make_label(i, words, tags, heads, deps, ents)
self.labels[dep] = len(self.labels) if label is not None and label not in self.labels:
token_vector_width = pipeline[0].model.nO self.labels[label] = len(self.labels)
print(len(self.labels))
if self.model is True: if self.model is True:
self.model = self.Model(len(self.labels), token_vector_width) token_vector_width = util.env_opt('token_vector_width')
self.model = chain(
tok2vec,
Softmax(len(self.labels), token_vector_width)
)
link_vectors_to_models(self.vocab)
@classmethod @classmethod
def Model(cls, n_tags, token_vector_width): def Model(cls, n_tags, tok2vec=None, **cfg):
return build_tagger_model(n_tags, token_vector_width) return build_tagger_model(n_tags, tok2vec=tok2vec, **cfg)
def get_loss(self, docs, golds, scores): def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores)
cdef int idx = 0 cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype='i') correct = numpy.zeros((scores.shape[0],), dtype='i')
guesses = scores.argmax(axis=1) guesses = scores.argmax(axis=1)
for gold in golds: for gold in golds:
for tag in gold.labels: for i in range(len(gold.labels)):
if tag is None or tag not in self.labels: label = self.make_label(i, gold.words, gold.tags, gold.heads,
gold.labels, gold.ents)
if label is None or label not in self.labels:
correct[idx] = guesses[idx] correct[idx] = guesses[idx]
else: else:
correct[idx] = self.labels[tag] correct[idx] = self.labels[label]
idx += 1 idx += 1
correct = self.model.ops.xp.array(correct, dtype='i') correct = self.model.ops.xp.array(correct, dtype='i')
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores /= d_scores.shape[0] d_scores /= d_scores.shape[0]
loss = (d_scores**2).sum() loss = (d_scores**2).sum()
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores return float(loss), d_scores
@staticmethod
def make_dep(i, words, tags, heads, deps, ents):
if deps[i] is None or heads[i] is None:
return None
return deps[i]
@staticmethod
def make_tag(i, words, tags, heads, deps, ents):
return tags[i]
@staticmethod
def make_ent(i, words, tags, heads, deps, ents):
if ents is None:
return None
return ents[i]
@staticmethod
def make_dep_tag_offset(i, words, tags, heads, deps, ents):
if deps[i] is None or heads[i] is None:
return None
offset = heads[i] - i
offset = min(offset, 2)
offset = max(offset, -2)
return '%s-%s:%d' % (deps[i], tags[i], offset)
@staticmethod
def make_ent_tag(i, words, tags, heads, deps, ents):
if ents is None or ents[i] is None:
return None
else:
return '%s-%s' % (tags[i], ents[i])
class SimilarityHook(BaseThincComponent): class SimilarityHook(BaseThincComponent):
""" """
@ -555,7 +642,7 @@ class SimilarityHook(BaseThincComponent):
return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length)) return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
def __call__(self, doc): def __call__(self, doc):
'''Install similarity hook''' """Install similarity hook"""
doc.user_hooks['similarity'] = self.predict doc.user_hooks['similarity'] = self.predict
return doc return doc
@ -564,15 +651,10 @@ class SimilarityHook(BaseThincComponent):
yield self(doc) yield self(doc)
def predict(self, doc1, doc2): def predict(self, doc1, doc2):
return self.model.predict([(doc1.tensor, doc2.tensor)]) return self.model.predict([(doc1, doc2)])
def update(self, doc1_tensor1_doc2_tensor2, golds, sgd=None, drop=0.): def update(self, doc1_doc2, golds, sgd=None, drop=0.):
doc1s, tensor1s, doc2s, tensor2s = doc1_tensor1_doc2_tensor2 sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
sims, bp_sims = self.model.begin_update(zip(tensor1s, tensor2s),
drop=drop)
d_tensor1s, d_tensor2s = bp_sims(golds, sgd=sgd)
return d_tensor1s, d_tensor2s
def begin_training(self, _=tuple(), pipeline=None): def begin_training(self, _=tuple(), pipeline=None):
""" """
@ -583,6 +665,7 @@ class SimilarityHook(BaseThincComponent):
""" """
if self.model is True: if self.model is True:
self.model = self.Model(pipeline[0].model.nO) self.model = self.Model(pipeline[0].model.nO)
link_vectors_to_models(self.vocab)
class TextCategorizer(BaseThincComponent): class TextCategorizer(BaseThincComponent):
@ -627,15 +710,13 @@ class TextCategorizer(BaseThincComponent):
for j, label in enumerate(self.labels): for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j]) doc.cats[label] = float(scores[i, j])
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None): def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
docs, tensors = docs_tensors
scores, bp_scores = self.model.begin_update(docs, drop=drop) scores, bp_scores = self.model.begin_update(docs, drop=drop)
loss, d_scores = self.get_loss(docs, golds, scores) loss, d_scores = self.get_loss(docs, golds, scores)
d_tensors = bp_scores(d_scores, sgd=sgd) bp_scores(d_scores, sgd=sgd)
if losses is not None: if losses is not None:
losses.setdefault(self.name, 0.0) losses.setdefault(self.name, 0.0)
losses[self.name] += loss losses[self.name] += loss
return d_tensors
def get_loss(self, docs, golds, scores): def get_loss(self, docs, golds, scores):
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f') truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
@ -653,8 +734,10 @@ class TextCategorizer(BaseThincComponent):
else: else:
token_vector_width = 64 token_vector_width = 64
if self.model is True: if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(len(self.labels), token_vector_width, self.model = self.Model(len(self.labels), token_vector_width,
**self.cfg) **self.cfg)
link_vectors_to_models(self.vocab)
cdef class EntityRecognizer(LinearParser): cdef class EntityRecognizer(LinearParser):
@ -695,6 +778,14 @@ cdef class NeuralDependencyParser(NeuralParser):
name = 'parser' name = 'parser'
TransitionSystem = ArcEager TransitionSystem = ArcEager
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
for target in []:
labeller = NeuralLabeller(self.vocab, target=target)
tok2vec = self.model[0]
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
pipeline.append(labeller)
self._multitasks.append(labeller)
def __reduce__(self): def __reduce__(self):
return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None) return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
@ -705,13 +796,13 @@ cdef class NeuralEntityRecognizer(NeuralParser):
nr_feature = 6 nr_feature = 6
def predict_confidences(self, docs): def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
tensors = [d.tensor for d in docs] for target in []:
samples = [] labeller = NeuralLabeller(self.vocab, target=target)
for i in range(10): tok2vec = self.model[0]
states = self.parse_batch(docs, tensors, drop=0.3) labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
for state in states: pipeline.append(labeller)
samples.append(self._get_entities(state)) self._multitasks.append(labeller)
def __reduce__(self): def __reduce__(self):
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None) return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)

View File

@ -1,4 +1,4 @@
cpdef enum symbol_t: cdef enum symbol_t:
NIL NIL
IS_ALPHA IS_ALPHA
IS_ASCII IS_ASCII

View File

@ -1,4 +1,6 @@
# coding: utf8 # coding: utf8
#cython: optimize.unpack_method_calls=False
from __future__ import unicode_literals from __future__ import unicode_literals
IDS = { IDS = {
@ -458,4 +460,11 @@ IDS = {
"xcomp": xcomp "xcomp": xcomp
} }
NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])] def sort_nums(x):
return x[1]
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)
# We keep the enum cdef, and just make sure the names are available to Python
locals().update(IDS)

View File

@ -147,10 +147,10 @@ def get_token_ids(states, int n_tokens):
nr_update = 0 nr_update = 0
def update_beam(TransitionSystem moves, int nr_feature, int max_steps, def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
states, tokvecs, golds, states, golds,
state2vec, vec2scores, state2vec, vec2scores,
int width, float density, int width, float density,
sgd=None, losses=None, drop=0.): losses=None, drop=0.):
global nr_update global nr_update
cdef MaxViolation violn cdef MaxViolation violn
nr_update += 1 nr_update += 1

View File

@ -101,9 +101,10 @@ cdef cppclass StateC:
elif n == 6: elif n == 6:
if this.B(0) >= 0: if this.B(0) >= 0:
ids[0] = this.B(0) ids[0] = this.B(0)
ids[1] = this.B(0)-1
else: else:
ids[0] = -1 ids[0] = -1
ids[1] = this.B(0) ids[1] = -1
ids[2] = this.B(1) ids[2] = this.B(1)
ids[3] = this.E(0) ids[3] = this.E(0)
if ids[3] >= 1: if ids[3] >= 1:
@ -120,6 +121,8 @@ cdef cppclass StateC:
for i in range(n): for i in range(n):
if ids[i] >= 0: if ids[i] >= 0:
ids[i] += this.offset ids[i] += this.offset
else:
ids[i] = -1
int S(int i) nogil const: int S(int i) nogil const:
if i >= this._s_i: if i >= this._s_i:
@ -162,9 +165,9 @@ cdef cppclass StateC:
int E(int i) nogil const: int E(int i) nogil const:
if this._e_i <= 0 or this._e_i >= this.length: if this._e_i <= 0 or this._e_i >= this.length:
return 0 return -1
if i < 0 or i >= this._e_i: if i < 0 or i >= this._e_i:
return 0 return -1
return this._ents[this._e_i - (i+1)].start return this._ents[this._e_i - (i+1)].start
int L(int i, int idx) nogil const: int L(int i, int idx) nogil const:

View File

@ -161,8 +161,7 @@ cdef class BiluoPushDown(TransitionSystem):
cdef Transition lookup_transition(self, object name) except *: cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label cdef attr_t label
if name == '-' or name == None: if name == '-' or name == None:
move_str = 'M' return Transition(clas=0, move=MISSING, label=0, score=0)
label = 0
elif name == '!O': elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0) return Transition(clas=0, move=ISNT, label=0, score=0)
elif '-' in name: elif '-' in name:
@ -220,6 +219,31 @@ cdef class BiluoPushDown(TransitionSystem):
raise Exception(move) raise Exception(move)
return t return t
#def add_action(self, int action, label_name):
# cdef attr_t label_id
# if not isinstance(label_name, (int, long)):
# label_id = self.strings.add(label_name)
# else:
# label_id = label_name
# if action == OUT and label_id != 0:
# return
# if action == MISSING or action == ISNT:
# return
# # Check we're not creating a move we already have, so that this is
# # idempotent
# for trans in self.c[:self.n_moves]:
# if trans.move == action and trans.label == label_id:
# return 0
# if self.n_moves >= self._size:
# self._size *= 2
# self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
# self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
# assert self.c[self.n_moves].label == label_id
# self.n_moves += 1
# return 1
cdef int initialize_state(self, StateC* st) nogil: cdef int initialize_state(self, StateC* st) nogil:
# This is especially necessary when we use limited training data. # This is especially necessary when we use limited training data.
for i in range(st.length): for i in range(st.length):

View File

@ -13,6 +13,7 @@ cdef class Parser:
cdef public object model cdef public object model
cdef readonly TransitionSystem moves cdef readonly TransitionSystem moves
cdef readonly object cfg cdef readonly object cfg
cdef public object _multitasks
cdef void _parse_step(self, StateC* state, cdef void _parse_step(self, StateC* state,
const float* feat_weights, const float* feat_weights,

View File

@ -7,6 +7,7 @@ from __future__ import unicode_literals, print_function
from collections import Counter, OrderedDict from collections import Counter, OrderedDict
import ujson import ujson
import json
import contextlib import contextlib
from libc.math cimport exp from libc.math cimport exp
@ -37,10 +38,9 @@ from preshed.maps cimport MapStruct
from preshed.maps cimport map_get from preshed.maps cimport map_get
from thinc.api import layerize, chain, noop, clone, with_flatten from thinc.api import layerize, chain, noop, clone, with_flatten
from thinc.neural import Model, Affine, ReLu, Maxout from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
from thinc.neural._classes.batchnorm import BatchNorm as BN from thinc.misc import LayerNorm
from thinc.neural._classes.selu import SELU
from thinc.neural._classes.layernorm import LayerNorm
from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module
@ -48,7 +48,8 @@ from .. import util
from ..util import get_async, get_cuda_stream from ..util import get_async, get_cuda_stream
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
from .._ml import Residual, drop_layer from .._ml import Residual, drop_layer, flatten
from .._ml import link_vectors_to_models
from ..compat import json_dumps from ..compat import json_dumps
from . import _parse_features from . import _parse_features
@ -238,14 +239,15 @@ cdef class Parser:
Base class of the DependencyParser and EntityRecognizer. Base class of the DependencyParser and EntityRecognizer.
""" """
@classmethod @classmethod
def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg): def Model(cls, nr_class, token_vector_width=128, hidden_width=200, depth=1, **cfg):
depth = util.env_opt('parser_hidden_depth', depth) depth = util.env_opt('parser_hidden_depth', depth)
token_vector_width = util.env_opt('token_vector_width', token_vector_width) token_vector_width = util.env_opt('token_vector_width', token_vector_width)
hidden_width = util.env_opt('hidden_width', hidden_width) hidden_width = util.env_opt('hidden_width', hidden_width)
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
embed_size = util.env_opt('embed_size', 4000) embed_size = util.env_opt('embed_size', 7000)
tensors = fine_tune(Tok2Vec(token_vector_width, embed_size, tok2vec = Tok2Vec(token_vector_width, embed_size,
preprocess=doc2feats())) pretrained_dims=cfg.get('pretrained_dims', 0))
tok2vec = chain(tok2vec, flatten)
if parser_maxout_pieces == 1: if parser_maxout_pieces == 1:
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature, nF=cls.nr_feature,
@ -262,8 +264,8 @@ cdef class Parser:
upper.is_noop = True upper.is_noop = True
else: else:
upper = chain( upper = chain(
clone(Maxout(hidden_width), (depth-1)), clone(Maxout(hidden_width), depth-1),
zero_init(Affine(nr_class, drop_factor=0.0)) zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
) )
upper.is_noop = False upper.is_noop = False
# TODO: This is an unfortunate hack atm! # TODO: This is an unfortunate hack atm!
@ -277,7 +279,7 @@ cdef class Parser:
'hidden_width': hidden_width, 'hidden_width': hidden_width,
'maxout_pieces': parser_maxout_pieces 'maxout_pieces': parser_maxout_pieces
} }
return (tensors, lower, upper), cfg return (tok2vec, lower, upper), cfg
def __init__(self, Vocab vocab, moves=True, model=True, **cfg): def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
""" """
@ -307,12 +309,16 @@ cdef class Parser:
cfg['beam_width'] = util.env_opt('beam_width', 1) cfg['beam_width'] = util.env_opt('beam_width', 1)
if 'beam_density' not in cfg: if 'beam_density' not in cfg:
cfg['beam_density'] = util.env_opt('beam_density', 0.0) cfg['beam_density'] = util.env_opt('beam_density', 0.0)
if 'pretrained_dims' not in cfg:
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
cfg.setdefault('cnn_maxout_pieces', 3)
self.cfg = cfg self.cfg = cfg
if 'actions' in self.cfg: if 'actions' in self.cfg:
for action, labels in self.cfg.get('actions', {}).items(): for action, labels in self.cfg.get('actions', {}).items():
for label in labels: for label in labels:
self.moves.add_action(action, label) self.moves.add_action(action, label)
self.model = model self.model = model
self._multitasks = []
def __reduce__(self): def __reduce__(self):
return (Parser, (self.vocab, self.moves, self.model), None, None) return (Parser, (self.vocab, self.moves, self.model), None, None)
@ -332,11 +338,11 @@ cdef class Parser:
beam_density = self.cfg.get('beam_density', 0.0) beam_density = self.cfg.get('beam_density', 0.0)
cdef Beam beam cdef Beam beam
if beam_width == 1: if beam_width == 1:
states = self.parse_batch([doc], [doc.tensor]) states = self.parse_batch([doc])
self.set_annotations([doc], states) self.set_annotations([doc], states)
return doc return doc
else: else:
beam = self.beam_parse([doc], [doc.tensor], beam = self.beam_parse([doc],
beam_width=beam_width, beam_density=beam_density)[0] beam_width=beam_width, beam_density=beam_density)[0]
output = self.moves.get_beam_annot(beam) output = self.moves.get_beam_annot(beam)
state = <StateClass>beam.at(0) state = <StateClass>beam.at(0)
@ -365,11 +371,11 @@ cdef class Parser:
cdef Beam beam cdef Beam beam
for docs in cytoolz.partition_all(batch_size, docs): for docs in cytoolz.partition_all(batch_size, docs):
docs = list(docs) docs = list(docs)
tokvecs = [doc.tensor for doc in docs]
if beam_width == 1: if beam_width == 1:
parse_states = self.parse_batch(docs, tokvecs) parse_states = self.parse_batch(docs)
beams = []
else: else:
beams = self.beam_parse(docs, tokvecs, beams = self.beam_parse(docs,
beam_width=beam_width, beam_density=beam_density) beam_width=beam_width, beam_density=beam_density)
parse_states = [] parse_states = []
for beam in beams: for beam in beams:
@ -377,7 +383,7 @@ cdef class Parser:
self.set_annotations(docs, parse_states) self.set_annotations(docs, parse_states)
yield from docs yield from docs
def parse_batch(self, docs, tokvecses): def parse_batch(self, docs):
cdef: cdef:
precompute_hiddens state2vec precompute_hiddens state2vec
StateClass state StateClass state
@ -388,21 +394,15 @@ cdef class Parser:
int nr_class, nr_feat, nr_piece, nr_dim, nr_state int nr_class, nr_feat, nr_piece, nr_dim, nr_state
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
if isinstance(tokvecses, np.ndarray):
tokvecses = [tokvecses]
tokvecs = self.model[0].ops.flatten(tokvecses) cuda_stream = get_cuda_stream()
if USE_FINE_TUNE: (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) 0.0)
nr_state = len(docs) nr_state = len(docs)
nr_class = self.moves.n_moves nr_class = self.moves.n_moves
nr_dim = tokvecs.shape[1] nr_dim = tokvecs.shape[1]
nr_feat = self.nr_feature nr_feat = self.nr_feature
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs,
cuda_stream, 0.0)
nr_piece = state2vec.nP nr_piece = state2vec.nP
states = self.moves.init_batch(docs) states = self.moves.init_batch(docs)
@ -418,21 +418,23 @@ cdef class Parser:
c_token_ids = <int*>token_ids.data c_token_ids = <int*>token_ids.data
c_is_valid = <int*>is_valid.data c_is_valid = <int*>is_valid.data
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
cdef int nr_step
while not next_step.empty(): while not next_step.empty():
nr_step = next_step.size()
if not has_hidden: if not has_hidden:
for i in cython.parallel.prange( for i in cython.parallel.prange(nr_step, num_threads=6,
next_step.size(), num_threads=6, nogil=True): nogil=True):
self._parse_step(next_step[i], self._parse_step(next_step[i],
feat_weights, nr_class, nr_feat, nr_piece) feat_weights, nr_class, nr_feat, nr_piece)
else: else:
for i in range(next_step.size()): for i in range(nr_step):
st = next_step[i] st = next_step[i]
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
self.moves.set_valid(&c_is_valid[i*nr_class], st) self.moves.set_valid(&c_is_valid[i*nr_class], st)
vectors = state2vec(token_ids[:next_step.size()]) vectors = state2vec(token_ids[:next_step.size()])
scores = vec2scores(vectors) scores = vec2scores(vectors)
c_scores = <float*>scores.data c_scores = <float*>scores.data
for i in range(next_step.size()): for i in range(nr_step):
st = next_step[i] st = next_step[i]
guess = arg_max_if_valid( guess = arg_max_if_valid(
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
@ -445,18 +447,15 @@ cdef class Parser:
next_step.push_back(st) next_step.push_back(st)
return states return states
def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001): def beam_parse(self, docs, int beam_width=3, float beam_density=0.001):
cdef Beam beam cdef Beam beam
cdef np.ndarray scores cdef np.ndarray scores
cdef Doc doc cdef Doc doc
cdef int nr_class = self.moves.n_moves cdef int nr_class = self.moves.n_moves
cdef StateClass stcls, output cdef StateClass stcls, output
tokvecs = self.model[0].ops.flatten(tokvecses)
if USE_FINE_TUNE:
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
cuda_stream = get_cuda_stream() cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
cuda_stream, 0.0) 0.0)
beams = [] beams = []
cdef int offset = 0 cdef int offset = 0
cdef int j = 0 cdef int j = 0
@ -516,29 +515,24 @@ cdef class Parser:
free(scores) free(scores)
free(token_ids) free(token_ids)
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): def update(self, docs, golds, drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds): if not any(self.moves.has_gold(gold) for gold in golds):
return None return None
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5: if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
return self.update_beam(docs_tokvecs, golds, return self.update_beam(docs, golds,
self.cfg['beam_width'], self.cfg['beam_density'], self.cfg['beam_width'], self.cfg['beam_density'],
drop=drop, sgd=sgd, losses=losses) drop=drop, sgd=sgd, losses=losses)
if losses is not None and self.name not in losses: if losses is not None and self.name not in losses:
losses[self.name] = 0. losses[self.name] = 0.
docs, tokvec_lists = docs_tokvecs
tokvecs = self.model[0].ops.flatten(tokvec_lists)
if isinstance(docs, Doc) and isinstance(golds, GoldParse): if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs] docs = [docs]
golds = [golds] golds = [golds]
if USE_FINE_TUNE:
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
tokvecs = self.model[0].ops.flatten(my_tokvecs)
cuda_stream = get_cuda_stream() cuda_stream = get_cuda_stream()
states, golds, max_steps = self._init_gold_batch(docs, golds) states, golds, max_steps = self._init_gold_batch(docs, golds)
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
0.0) drop)
todo = [(s, g) for (s, g) in zip(states, golds) todo = [(s, g) for (s, g) in zip(states, golds)
if not s.is_final() and g is not None] if not s.is_final() and g is not None]
if not todo: if not todo:
@ -582,13 +576,9 @@ cdef class Parser:
if n_steps >= max_steps: if n_steps >= max_steps:
break break
self._make_updates(d_tokvecs, self._make_updates(d_tokvecs,
backprops, sgd, cuda_stream) bp_tokvecs, backprops, sgd, cuda_stream)
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
if USE_FINE_TUNE:
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs
def update_beam(self, docs_tokvecs, golds, width=None, density=None, def update_beam(self, docs, golds, width=None, density=None,
drop=0., sgd=None, losses=None): drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds): if not any(self.moves.has_gold(gold) for gold in golds):
return None return None
@ -600,26 +590,20 @@ cdef class Parser:
density = self.cfg.get('beam_density', 0.0) density = self.cfg.get('beam_density', 0.0)
if losses is not None and self.name not in losses: if losses is not None and self.name not in losses:
losses[self.name] = 0. losses[self.name] = 0.
docs, tokvecs = docs_tokvecs
lengths = [len(d) for d in docs] lengths = [len(d) for d in docs]
assert min(lengths) >= 1 assert min(lengths) >= 1
tokvecs = self.model[0].ops.flatten(tokvecs)
if USE_FINE_TUNE:
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
tokvecs += self.model[0].ops.flatten(my_tokvecs)
states = self.moves.init_batch(docs) states = self.moves.init_batch(docs)
for gold in golds: for gold in golds:
self.moves.preprocess_gold(gold) self.moves.preprocess_gold(gold)
cuda_stream = get_cuda_stream() cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500, states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
states, tokvecs, golds, states, golds,
state2vec, vec2scores, state2vec, vec2scores,
width, density, width, density,
sgd=sgd, drop=drop, losses=losses) drop=drop, losses=losses)
backprop_lower = [] backprop_lower = []
cdef float batch_size = len(docs) cdef float batch_size = len(docs)
for i, d_scores in enumerate(states_d_scores): for i, d_scores in enumerate(states_d_scores):
@ -637,11 +621,7 @@ cdef class Parser:
else: else:
backprop_lower.append((ids, d_vector, bp_vectors)) backprop_lower.append((ids, d_vector, bp_vectors))
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
if USE_FINE_TUNE:
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs
def _init_gold_batch(self, whole_docs, whole_golds): def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long """Make a square batch, of length equal to the shortest doc. A long
@ -679,7 +659,7 @@ cdef class Parser:
max_moves = max(max_moves, len(oracle_actions)) max_moves = max(max_moves, len(oracle_actions))
return states, golds, max_moves return states, golds, max_moves
def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None): def _make_updates(self, d_tokvecs, bp_tokvecs, backprops, sgd, cuda_stream=None):
# Tells CUDA to block, so our async copies complete. # Tells CUDA to block, so our async copies complete.
if cuda_stream is not None: if cuda_stream is not None:
cuda_stream.synchronize() cuda_stream.synchronize()
@ -690,6 +670,7 @@ cdef class Parser:
d_state_features *= mask.reshape(ids.shape + (1,)) d_state_features *= mask.reshape(ids.shape + (1,))
self.model[0].ops.scatter_add(d_tokvecs, ids * mask, self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
d_state_features) d_state_features)
bp_tokvecs(d_tokvecs, sgd=sgd)
@property @property
def move_names(self): def move_names(self):
@ -699,11 +680,12 @@ cdef class Parser:
names.append(name) names.append(name)
return names return names
def get_batch_model(self, batch_size, tokvecs, stream, dropout): def get_batch_model(self, docs, stream, dropout):
_, lower, upper = self.model tok2vec, lower, upper = self.model
state2vec = precompute_hiddens(batch_size, tokvecs, tokvecs, bp_tokvecs = tok2vec.begin_update(docs, drop=dropout)
lower, stream, drop=dropout) state2vec = precompute_hiddens(len(docs), tokvecs,
return state2vec, upper lower, stream, drop=0.0)
return (tokvecs, bp_tokvecs), state2vec, upper
nr_feature = 8 nr_feature = 8
@ -766,7 +748,7 @@ cdef class Parser:
# order, or the model goes out of synch # order, or the model goes out of synch
self.cfg.setdefault('extra_labels', []).append(label) self.cfg.setdefault('extra_labels', []).append(label)
def begin_training(self, gold_tuples, **cfg): def begin_training(self, gold_tuples, pipeline=None, **cfg):
if 'model' in cfg: if 'model' in cfg:
self.model = cfg['model'] self.model = cfg['model']
gold_tuples = nonproj.preprocess_training_data(gold_tuples) gold_tuples = nonproj.preprocess_training_data(gold_tuples)
@ -775,9 +757,22 @@ cdef class Parser:
for label in labels: for label in labels:
self.moves.add_action(action, label) self.moves.add_action(action, label)
if self.model is True: if self.model is True:
cfg['pretrained_dims'] = self.vocab.vectors_length
self.model, cfg = self.Model(self.moves.n_moves, **cfg) self.model, cfg = self.Model(self.moves.n_moves, **cfg)
self.init_multitask_objectives(gold_tuples, pipeline, **cfg)
link_vectors_to_models(self.vocab)
self.cfg.update(cfg) self.cfg.update(cfg)
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
'''Setup models for secondary objectives, to benefit from multi-task
learning. This method is intended to be overridden by subclasses.
For instance, the dependency parser can benefit from sharing
an input representation with a label prediction model. These auxiliary
models are discarded after training.
'''
pass
def preprocess_gold(self, docs_golds): def preprocess_gold(self, docs_golds):
for doc, gold in docs_golds: for doc, gold in docs_golds:
yield doc, gold yield doc, gold
@ -813,6 +808,7 @@ cdef class Parser:
if 'model' not in exclude: if 'model' not in exclude:
path = util.ensure_path(path) path = util.ensure_path(path)
if self.model is True: if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model, cfg = self.Model(**self.cfg) self.model, cfg = self.Model(**self.cfg)
else: else:
cfg = {} cfg = {}
@ -835,7 +831,7 @@ cdef class Parser:
('upper_model', lambda: self.model[2].to_bytes()), ('upper_model', lambda: self.model[2].to_bytes()),
('vocab', lambda: self.vocab.to_bytes()), ('vocab', lambda: self.vocab.to_bytes()),
('moves', lambda: self.moves.to_bytes(strings=False)), ('moves', lambda: self.moves.to_bytes(strings=False)),
('cfg', lambda: ujson.dumps(self.cfg)) ('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True))
)) ))
if 'model' in exclude: if 'model' in exclude:
exclude['tok2vec_model'] = True exclude['tok2vec_model'] = True
@ -848,7 +844,7 @@ cdef class Parser:
deserializers = OrderedDict(( deserializers = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)), ('vocab', lambda b: self.vocab.from_bytes(b)),
('moves', lambda b: self.moves.from_bytes(b, strings=False)), ('moves', lambda b: self.moves.from_bytes(b, strings=False)),
('cfg', lambda b: self.cfg.update(ujson.loads(b))), ('cfg', lambda b: self.cfg.update(json.loads(b))),
('tok2vec_model', lambda b: None), ('tok2vec_model', lambda b: None),
('lower_model', lambda b: None), ('lower_model', lambda b: None),
('upper_model', lambda b: None) ('upper_model', lambda b: None)
@ -856,9 +852,11 @@ cdef class Parser:
msg = util.from_bytes(bytes_data, deserializers, exclude) msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'model' not in exclude: if 'model' not in exclude:
if self.model is True: if self.model is True:
self.model, cfg = self.Model(self.moves.n_moves) self.model, cfg = self.Model(**self.cfg)
cfg['pretrained_dims'] = self.vocab.vectors_length
else: else:
cfg = {} cfg = {}
cfg['pretrained_dims'] = self.vocab.vectors_length
if 'tok2vec_model' in msg: if 'tok2vec_model' in msg:
self.model[0].from_bytes(msg['tok2vec_model']) self.model[0].from_bytes(msg['tok2vec_model'])
if 'lower_model' in msg: if 'lower_model' in msg:

View File

@ -148,7 +148,7 @@ cdef class TransitionSystem:
def add_action(self, int action, label_name): def add_action(self, int action, label_name):
cdef attr_t label_id cdef attr_t label_id
if not isinstance(label_name, int): if not isinstance(label_name, (int, long)):
label_id = self.strings.add(label_name) label_id = self.strings.add(label_name)
else: else:
label_id = label_name label_id = label_name

View File

@ -12,7 +12,7 @@ from .. import util
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id', _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx'] 'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'th','xx']
_models = {'en': ['en_core_web_sm'], _models = {'en': ['en_core_web_sm'],
'de': ['de_core_news_md'], 'de': ['de_core_news_md'],
'fr': ['fr_depvec_web_lg'], 'fr': ['fr_depvec_web_lg'],
@ -108,6 +108,11 @@ def he_tokenizer():
def nb_tokenizer(): def nb_tokenizer():
return util.get_lang_class('nb').Defaults.create_tokenizer() return util.get_lang_class('nb').Defaults.create_tokenizer()
@pytest.fixture
def th_tokenizer():
pythainlp = pytest.importorskip("pythainlp")
return util.get_lang_class('th').Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
def stringstore(): def stringstore():

View File

@ -67,12 +67,6 @@ def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
assert len(tokens) == 4 assert len(tokens) == 4
@pytest.mark.parametrize('text', ["blau-rot"])
def test_tokenizer_splits_hyphens(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) @pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_tokenizer_splits_numeric_range(de_tokenizer, text): def test_tokenizer_splits_numeric_range(de_tokenizer, text):
tokens = de_tokenizer(text) tokens = de_tokenizer(text)
@ -100,17 +94,21 @@ def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
assert len(tokens) == 3 assert len(tokens) == 3
@pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt'])
def test_tokenizer_keeps_hyphens(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 1
def test_tokenizer_splits_double_hyphen_infix(de_tokenizer): def test_tokenizer_splits_double_hyphen_infix(de_tokenizer):
tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.") tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
assert len(tokens) == 12 assert len(tokens) == 10
assert tokens[0].text == "Viele" assert tokens[0].text == "Viele"
assert tokens[1].text == "Regeln" assert tokens[1].text == "Regeln"
assert tokens[2].text == "--" assert tokens[2].text == "--"
assert tokens[3].text == "wie" assert tokens[3].text == "wie"
assert tokens[4].text == "die" assert tokens[4].text == "die"
assert tokens[5].text == "Bindestrich" assert tokens[5].text == "Bindestrich-Regeln"
assert tokens[6].text == "-" assert tokens[6].text == "--"
assert tokens[7].text == "Regeln" assert tokens[7].text == "sind"
assert tokens[8].text == "--" assert tokens[8].text == "kompliziert"
assert tokens[9].text == "sind"
assert tokens[10].text == "kompliziert"

View File

@ -25,15 +25,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
assert len(tokens) == 109 assert len(tokens) == 109
@pytest.mark.parametrize('text,length', [ @pytest.mark.parametrize('text', [
("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1), "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1), "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
("Kraftfahrzeug-Haftpflichtversicherung", 3), "Kraftfahrzeug-Haftpflichtversicherung",
("Vakuum-Mittelfrequenz-Induktionsofen", 5) "Vakuum-Mittelfrequenz-Induktionsofen"
]) ])
def test_tokenizer_handles_long_words(de_tokenizer, text, length): def test_tokenizer_handles_long_words(de_tokenizer, text):
tokens = de_tokenizer(text) tokens = de_tokenizer(text)
assert len(tokens) == length assert len(tokens) == 1
@pytest.mark.parametrize('text,length', [ @pytest.mark.parametrize('text,length', [

View File

View File

@ -0,0 +1,13 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
TOKENIZER_TESTS = [
("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม'])
]
@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
def test_thai_tokenizer(th_tokenizer, text, expected_tokens):
tokens = [token.text for token in th_tokenizer(text)]
assert tokens == expected_tokens

View File

@ -26,7 +26,7 @@ def arc_eager(vocab):
@pytest.fixture @pytest.fixture
def tok2vec(): def tok2vec():
return Tok2Vec(8, 100, preprocess=doc2feats()) return Tok2Vec(8, 100)
@pytest.fixture @pytest.fixture
@ -61,33 +61,22 @@ def test_predict_doc(parser, tok2vec, model, doc):
parser(doc) parser(doc)
def test_update_doc(parser, tok2vec, model, doc, gold): def test_update_doc(parser, model, doc, gold):
parser.model = model parser.model = model
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
d_tokvecs = parser.update(([doc], tokvecs), [gold])
assert d_tokvecs[0].shape == tokvecs[0].shape
def optimize(weights, gradient, key=None): def optimize(weights, gradient, key=None):
weights -= 0.001 * gradient weights -= 0.001 * gradient
bp_tokvecs(d_tokvecs, sgd=optimize) parser.update([doc], [gold], sgd=optimize)
assert d_tokvecs[0].sum() == 0.
def test_predict_doc_beam(parser, tok2vec, model, doc): def test_predict_doc_beam(parser, model, doc):
doc.tensor = tok2vec([doc])[0]
parser.model = model parser.model = model
parser(doc, beam_width=32, beam_density=0.001) parser(doc, beam_width=32, beam_density=0.001)
for word in doc:
print(word.text, word.head, word.dep_)
def test_update_doc_beam(parser, tok2vec, model, doc, gold): def test_update_doc_beam(parser, model, doc, gold):
parser.model = model parser.model = model
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
d_tokvecs = parser.update_beam(([doc], tokvecs), [gold])
assert d_tokvecs[0].shape == tokvecs[0].shape
def optimize(weights, gradient, key=None): def optimize(weights, gradient, key=None):
weights -= 0.001 * gradient weights -= 0.001 * gradient
bp_tokvecs(d_tokvecs, sgd=optimize) parser.update_beam([doc], [gold], sgd=optimize)
assert d_tokvecs[0].sum() == 0.

View File

@ -0,0 +1,8 @@
import pytest
@pytest.mark.models('en')
def test_issue1305(EN):
'''Test lemmatization of English VBZ'''
assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work'])
doc = EN(u'This app works well')
assert doc[2].lemma_ == 'work'

View File

@ -0,0 +1,14 @@
from __future__ import unicode_literals
import pytest
from ...language import Language
def test_issue1380_empty_string():
nlp = Language()
doc = nlp('')
assert len(doc) == 0
@pytest.mark.models('en')
def test_issue1380_en(EN):
doc = EN('')
assert len(doc) == 0

View File

@ -9,11 +9,14 @@ import pytest
@pytest.mark.models('en') @pytest.mark.models('en')
def test_issue429(EN): def test_issue429(EN):
def merge_phrases(matcher, doc, i, matches): def merge_phrases(matcher, doc, i, matches):
if i != len(matches) - 1: if i != len(matches) - 1:
return None return None
spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches] spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
for ent_id, label, span in spans: for ent_id, label, span in spans:
span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label]) span.merge(
tag=('NNP' if label else span.root.tag_),
lemma=span.text,
label='PERSON')
doc = EN('a') doc = EN('a')
matcher = Matcher(EN.vocab) matcher = Matcher(EN.vocab)

View File

@ -11,7 +11,7 @@ import pytest
def taggers(en_vocab): def taggers(en_vocab):
tagger1 = Tagger(en_vocab) tagger1 = Tagger(en_vocab)
tagger2 = Tagger(en_vocab) tagger2 = Tagger(en_vocab)
tagger1.model = tagger1.Model(8, 8) tagger1.model = tagger1.Model(8)
tagger2.model = tagger1.model tagger2.model = tagger1.model
return (tagger1, tagger2) return (tagger1, tagger2)

View File

@ -6,6 +6,16 @@ from ...strings import StringStore
import pytest import pytest
def test_string_hash(stringstore):
'''Test that string hashing is stable across platforms'''
ss = stringstore
assert ss.add('apple') == 8566208034543834098
heart = '\U0001f499'
print(heart)
h = ss.add(heart)
assert h == 11841826740069053588
def test_stringstore_from_api_docs(stringstore): def test_stringstore_from_api_docs(stringstore):
apple_hash = stringstore.add('apple') apple_hash = stringstore.add('apple')
assert apple_hash == 8566208034543834098 assert apple_hash == 8566208034543834098

View File

@ -34,7 +34,6 @@ def test_matcher_from_api_docs(en_vocab):
assert len(patterns[0]) assert len(patterns[0])
@pytest.mark.xfail
def test_matcher_from_usage_docs(en_vocab): def test_matcher_from_usage_docs(en_vocab):
text = "Wow 😀 This is really cool! 😂 😂" text = "Wow 😀 This is really cool! 😂 😂"
doc = get_doc(en_vocab, words=text.split(' ')) doc = get_doc(en_vocab, words=text.split(' '))
@ -46,7 +45,8 @@ def test_matcher_from_usage_docs(en_vocab):
if doc.vocab.strings[match_id] == 'HAPPY': if doc.vocab.strings[match_id] == 'HAPPY':
doc.sentiment += 0.1 doc.sentiment += 0.1
span = doc[start : end] span = doc[start : end]
token = span.merge(norm='happy emoji') token = span.merge()
token.vocab[token.text].norm_ = 'happy emoji'
matcher = Matcher(en_vocab) matcher = Matcher(en_vocab)
matcher.add('HAPPY', label_sentiment, *pos_patterns) matcher.add('HAPPY', label_sentiment, *pos_patterns)
@ -98,11 +98,11 @@ def test_matcher_match_multi(matcher):
(doc.vocab.strings['Java'], 5, 6)] (doc.vocab.strings['Java'], 5, 6)]
@pytest.mark.xfail
def test_matcher_phrase_matcher(en_vocab): def test_matcher_phrase_matcher(en_vocab):
words = ["Google", "Now"] words = ["Google", "Now"]
doc = get_doc(en_vocab, words) doc = get_doc(en_vocab, words)
matcher = PhraseMatcher(en_vocab, [doc]) matcher = PhraseMatcher(en_vocab)
matcher.add('COMPANY', None, doc)
words = ["I", "like", "Google", "Now", "best"] words = ["I", "like", "Google", "Now", "best"]
doc = get_doc(en_vocab, words) doc = get_doc(en_vocab, words)
assert len(matcher(doc)) == 1 assert len(matcher(doc)) == 1

View File

@ -9,7 +9,8 @@ from .util import get_doc
from pathlib import Path from pathlib import Path
import pytest import pytest
from thinc.neural import Maxout, Softmax from thinc.neural._classes.maxout import Maxout
from thinc.neural._classes.softmax import Softmax
from thinc.api import chain from thinc.api import chain

View File

@ -1,6 +1,7 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import sys
import pytest import pytest
@ -37,9 +38,10 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
tokens = tokenizer(text) tokens = tokenizer(text)
assert len(tokens) == length assert len(tokens) == length
@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8), @pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
('i💙you', 3), ('🤘🤘yay!', 4)]) ('i💙you', 3), ('🤘🤘yay!', 4)])
def test_tokenizer_handles_emoji(tokenizer, text, length): def test_tokenizer_handles_emoji(tokenizer, text, length):
tokens = tokenizer(text) # These break on narrow unicode builds, e.g. Windows
assert len(tokens) == length if sys.maxunicode >= 1114111:
tokens = tokenizer(text)
assert len(tokens) == length

View File

@ -54,7 +54,7 @@ cdef class Doc:
cdef public object noun_chunks_iterator cdef public object noun_chunks_iterator
cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1 cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1
cpdef np.ndarray to_array(self, object features) cpdef np.ndarray to_array(self, object features)

View File

@ -660,7 +660,7 @@ cdef class Doc:
""" """
with path.open('rb') as file_: with path.open('rb') as file_:
bytes_data = file_.read() bytes_data = file_.read()
self.from_bytes(bytes_data, **exclude) return self.from_bytes(bytes_data, **exclude)
def to_bytes(self, **exclude): def to_bytes(self, **exclude):
"""Serialize, i.e. export the document contents to a binary string. """Serialize, i.e. export the document contents to a binary string.

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals, print_function
import os import os
import ujson import ujson
import pip import pkg_resources
import importlib import importlib
import regex as re import regex as re
from pathlib import Path from pathlib import Path
@ -14,6 +14,7 @@ import numpy
import io import io
import dill import dill
from collections import OrderedDict from collections import OrderedDict
from thinc.neural._classes.model import Model
import msgpack import msgpack
import msgpack_numpy import msgpack_numpy
@ -180,9 +181,10 @@ def is_package(name):
name (unicode): Name of package. name (unicode): Name of package.
RETURNS (bool): True if installed package, False if not. RETURNS (bool): True if installed package, False if not.
""" """
packages = pip.get_installed_distributions() name = name.lower() # compare package name against lowercase name
packages = pkg_resources.working_set.by_key.keys()
for package in packages: for package in packages:
if package.project_name.replace('-', '_') == name: if package.lower().replace('-', '_') == name:
return True return True
return False return False
@ -193,6 +195,7 @@ def get_package_path(name):
name (unicode): Package name. name (unicode): Package name.
RETURNS (Path): Path to installed package. RETURNS (Path): Path to installed package.
""" """
name = name.lower() # use lowercase version to be safe
# Here we're importing the module just to find it. This is worryingly # Here we're importing the module just to find it. This is worryingly
# indirect, but it's otherwise very difficult to find the package. # indirect, but it's otherwise very difficult to find the package.
pkg = importlib.import_module(name) pkg = importlib.import_module(name)
@ -557,3 +560,17 @@ def minify_html(html):
RETURNS (unicode): "Minified" HTML. RETURNS (unicode): "Minified" HTML.
""" """
return html.strip().replace(' ', '').replace('\n', '') return html.strip().replace(' ', '').replace('\n', '')
def use_gpu(gpu_id):
    """Activate a GPU device and point thinc's Model at CupyOps.

    gpu_id: Device ID passed to cupy.cuda.device.Device.
    RETURNS: The activated cupy device, or None if cupy can't be imported.
    """
    try:
        import cupy.cuda.device
    except ImportError:
        # cupy is not installed: leave the current ops untouched.
        return None
    else:
        from thinc.neural.ops import CupyOps
        gpu = cupy.cuda.device.Device(gpu_id)
        gpu.use()
        # Swap both the shared ops instance and the ops class on Model.
        Model.ops = CupyOps()
        Model.Ops = CupyOps
        return gpu

View File

@ -6,6 +6,8 @@ import msgpack
import msgpack_numpy import msgpack_numpy
msgpack_numpy.patch() msgpack_numpy.patch()
cimport numpy as np cimport numpy as np
from thinc.neural.util import get_array_module
from thinc.neural._classes.model import Model
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .strings cimport StringStore from .strings cimport StringStore
@ -14,15 +16,29 @@ from .compat import basestring_
cdef class Vectors: cdef class Vectors:
'''Store, save and load word vectors.''' '''Store, save and load word vectors.
Vectors data is kept in the vectors.data attribute, which should be an
instance of numpy.ndarray (for CPU vectors)
or cupy.ndarray (for GPU vectors).
vectors.key2row is a dictionary mapping word hashes to rows
in the vectors.data table. The array `vectors.keys` keeps
the keys in order, such that keys[vectors.key2row[key]] == key.
'''
cdef public object data cdef public object data
cdef readonly StringStore strings cdef readonly StringStore strings
cdef public object key2row cdef public object key2row
cdef public object keys cdef public object keys
cdef public int i cdef public int i
def __init__(self, strings, data_or_width): def __init__(self, strings, data_or_width=0):
self.strings = StringStore() if isinstance(strings, StringStore):
self.strings = strings
else:
self.strings = StringStore()
for string in strings:
self.strings.add(string)
if isinstance(data_or_width, int): if isinstance(data_or_width, int):
self.data = data = numpy.zeros((len(strings), data_or_width), self.data = data = numpy.zeros((len(strings), data_or_width),
dtype='f') dtype='f')
@ -37,6 +53,11 @@ cdef class Vectors:
return (Vectors, (self.strings, self.data)) return (Vectors, (self.strings, self.data))
def __getitem__(self, key): def __getitem__(self, key):
'''Get a vector by key. If key is a string, it is hashed
to an integer ID using the vectors.strings table.
If the integer key is not found in the table, a KeyError is raised.
'''
if isinstance(key, basestring): if isinstance(key, basestring):
key = self.strings[key] key = self.strings[key]
i = self.key2row[key] i = self.key2row[key]
@ -46,23 +67,30 @@ cdef class Vectors:
return self.data[i] return self.data[i]
def __setitem__(self, key, vector): def __setitem__(self, key, vector):
'''Set a vector for the given key. If key is a string, it is hashed
to an integer ID using the vectors.strings table.
'''
if isinstance(key, basestring): if isinstance(key, basestring):
key = self.strings.add(key) key = self.strings.add(key)
i = self.key2row[key] i = self.key2row[key]
self.data[i] = vector self.data[i] = vector
def __iter__(self): def __iter__(self):
'''Yield vectors from the table.'''
yield from self.data yield from self.data
def __len__(self): def __len__(self):
'''Return the number of vectors that have been assigned.'''
return self.i return self.i
def __contains__(self, key): def __contains__(self, key):
'''Check whether a key has a vector entry in the table.'''
if isinstance(key, basestring_): if isinstance(key, basestring_):
key = self.strings[key] key = self.strings[key]
return key in self.key2row return key in self.key2row
def add(self, key, vector=None): def add(self, key, vector=None):
'''Add a key to the table, optionally setting a vector value as well.'''
if isinstance(key, basestring_): if isinstance(key, basestring_):
key = self.strings.add(key) key = self.strings.add(key)
if key not in self.key2row: if key not in self.key2row:
@ -80,7 +108,9 @@ cdef class Vectors:
return i return i
def items(self): def items(self):
for i, string in enumerate(self.strings): '''Iterate over (string key, vector) pairs, in order.'''
for i, key in enumerate(self.keys):
string = self.strings[key]
yield string, self.data[i] yield string, self.data[i]
@property @property
@ -118,9 +148,14 @@ cdef class Vectors:
self.data self.data
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
xp = get_array_module(self.data)
if xp is numpy:
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
else:
save_array = lambda arr, file_: xp.save(file_, arr)
serializers = OrderedDict(( serializers = OrderedDict((
('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)), ('vectors', lambda p: save_array(self.data, p.open('wb'))),
('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)), ('keys', lambda p: xp.save(p.open('wb'), self.keys))
)) ))
return util.to_disk(path, serializers, exclude) return util.to_disk(path, serializers, exclude)
@ -133,8 +168,9 @@ cdef class Vectors:
self.key2row[key] = i self.key2row[key] = i
def load_vectors(path): def load_vectors(path):
xp = Model.ops.xp
if path.exists(): if path.exists():
self.data = numpy.load(path) self.data = xp.load(path)
serializers = OrderedDict(( serializers = OrderedDict((
('keys', load_keys), ('keys', load_keys),

View File

@ -27,6 +27,7 @@ from .vectors import Vectors
from . import util from . import util
from . import attrs from . import attrs
from . import symbols from . import symbols
from ._ml import link_vectors_to_models
cdef class Vocab: cdef class Vocab:
@ -65,7 +66,7 @@ cdef class Vocab:
self.strings.add(name) self.strings.add(name)
self.lex_attr_getters = lex_attr_getters self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings, tag_map, lemmatizer) self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.vectors = Vectors(self.strings, 300) self.vectors = Vectors(self.strings)
property lang: property lang:
def __get__(self): def __get__(self):
@ -261,7 +262,7 @@ cdef class Vocab:
Words can be looked up by string or int ID. Words can be looked up by string or int ID.
RETURNS: RETURNS:
A word vector. Size and shape determed by the A word vector. Size and shape determined by the
vocab.vectors instance. Usually, a numpy ndarray vocab.vectors instance. Usually, a numpy ndarray
of shape (300,) and dtype float32. of shape (300,) and dtype float32.
@ -323,6 +324,7 @@ cdef class Vocab:
self.lexemes_from_bytes(file_.read()) self.lexemes_from_bytes(file_.read())
if self.vectors is not None: if self.vectors is not None:
self.vectors.from_disk(path, exclude='strings.json') self.vectors.from_disk(path, exclude='strings.json')
link_vectors_to_models(self)
return self return self
def to_bytes(self, **exclude): def to_bytes(self, **exclude):
@ -436,6 +438,7 @@ def unpickle_vocab(sstore, morphology, data_dir,
vocab.lex_attr_getters = lex_attr_getters vocab.lex_attr_getters = lex_attr_getters
vocab.lexemes_from_bytes(lexemes_data) vocab.lexemes_from_bytes(lexemes_data)
vocab.length = length vocab.length = length
link_vectors_to_models(vocab)
return vocab return vocab

View File

@ -17,6 +17,7 @@ fi
if [ "${VIA}" == "compile" ]; then if [ "${VIA}" == "compile" ]; then
pip install -r requirements.txt pip install -r requirements.txt
python setup.py build_ext --inplace
pip install -e . pip install -e .
fi fi

View File

@ -8,4 +8,5 @@ include _includes/_mixins
| does not exist! | does not exist!
h2.c-landing__title.u-heading-3.u-padding-small h2.c-landing__title.u-heading-3.u-padding-small
a(href="javascript:history.go(-1)") Click here to go back. +button(false, true, "secondary-light")(href="javascript:history.go(-1)")
| Click here to go back

View File

@ -3,24 +3,22 @@
"landing": true, "landing": true,
"logos": [ "logos": [
{ {
"quora": [ "https://www.quora.com", 150 ], "airbnb": [ "https://www.airbnb.com", 150, 45],
"chartbeat": [ "https://chartbeat.com", 200 ], "quora": [ "https://www.quora.com", 120, 34 ],
"duedil": [ "https://www.duedil.com", 150 ], "retriever": [ "https://www.retriever.no", 150, 33 ],
"stitchfix": [ "https://www.stitchfix.com", 190 ] "stitchfix": [ "https://www.stitchfix.com", 150, 18 ]
}, },
{ {
"wayblazer": [ "http://wayblazer.com", 200 ], "chartbeat": [ "https://chartbeat.com", 180, 25 ],
"indico": [ "https://indico.io", 150 ], "allenai": [ "https://allenai.org", 220, 37 ]
"chattermill": [ "https://chattermill.io", 175 ], }
"turi": [ "https://turi.com", 150 ], ],
"kip": [ "http://kipthis.com", 70 ] "features": [
},
{ {
"socrata": [ "https://www.socrata.com", 150 ], "thoughtworks": ["https://www.thoughtworks.com/radar/tools", 150, 28],
"cytora": [ "http://www.cytora.com", 125 ], "wapo": ["https://www.washingtonpost.com/news/wonk/wp/2016/05/18/googles-new-artificial-intelligence-cant-understand-these-sentences-can-you/", 100, 77],
"signaln": [ "http://signaln.com", 150 ], "venturebeat": ["https://venturebeat.com/2017/01/27/4-ai-startups-that-analyze-customer-reviews/", 150, 19],
"wonderflow": [ "http://www.wonderflow.co", 200 ], "microsoft": ["https://www.microsoft.com/developerblog/2016/09/13/training-a-classifier-for-relation-extraction-from-medical-literature/", 130, 28]
"synapsify": [ "http://www.gosynapsify.com", 150 ]
} }
] ]
}, },
@ -34,7 +32,24 @@
"landing": true "landing": true
}, },
"announcement" : { "styleguide": {
"title": "Important Announcement" "title": "Styleguide",
"sidebar": {
"Styleguide": { "": "styleguide" },
"Resources": {
"Website Source": "https://github.com/explosion/spacy/tree/master/website",
"Contributing Guide": "https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md"
}
},
"menu": {
"Introduction": "intro",
"Logo": "logo",
"Colors": "colors",
"Typography": "typography",
"Elements": "elements",
"Components": "components",
"Embeds": "embeds",
"Markup Reference": "markup"
}
} }
} }

View File

@ -11,12 +11,9 @@
"COMPANY": "Explosion AI", "COMPANY": "Explosion AI",
"COMPANY_URL": "https://explosion.ai", "COMPANY_URL": "https://explosion.ai",
"DEMOS_URL": "https://demos.explosion.ai", "DEMOS_URL": "https://demos.explosion.ai",
"MODELS_REPO": "explosion/spacy-models",
"SPACY_VERSION": "1.8", "SPACY_VERSION": "2.0",
"LATEST_NEWS": {
"url": "https://github.com/explosion/spaCy/releases/tag/v2.0.0-alpha",
"title": "Test spaCy v2.0.0 alpha!"
},
"SOCIAL": { "SOCIAL": {
"twitter": "spacy_io", "twitter": "spacy_io",
@ -27,25 +24,23 @@
}, },
"NAVIGATION": { "NAVIGATION": {
"Home": "/", "Usage": "/usage",
"Usage": "/docs/usage", "Models": "/models",
"Reference": "/docs/api", "API": "/api"
"Demos": "/docs/usage/showcase",
"Blog": "https://explosion.ai/blog"
}, },
"FOOTER": { "FOOTER": {
"spaCy": { "spaCy": {
"Usage": "/docs/usage", "Usage": "/usage",
"API Reference": "/docs/api", "Models": "/models",
"Tutorials": "/docs/usage/tutorials", "API Reference": "/api",
"Showcase": "/docs/usage/showcase" "Resources": "/usage/resources"
}, },
"Support": { "Support": {
"Issue Tracker": "https://github.com/explosion/spaCy/issues", "Issue Tracker": "https://github.com/explosion/spaCy/issues",
"StackOverflow": "http://stackoverflow.com/questions/tagged/spacy", "StackOverflow": "http://stackoverflow.com/questions/tagged/spacy",
"Reddit usergroup": "https://www.reddit.com/r/spacynlp/", "Reddit Usergroup": "https://www.reddit.com/r/spacynlp/",
"Gitter chat": "https://gitter.im/explosion/spaCy" "Gitter Chat": "https://gitter.im/explosion/spaCy"
}, },
"Connect": { "Connect": {
"Twitter": "https://twitter.com/spacy_io", "Twitter": "https://twitter.com/spacy_io",
@ -74,21 +69,11 @@
{"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }, {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" },
{"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVIDIA graphics card with CUDA 2+. See section below for more info."}] {"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVIDIA graphics card with CUDA 2+. See section below for more info."}]
}, },
{ "id": "model", "title": "Models", "multiple": true, "options": [ { "id": "model", "title": "Models", "multiple": true }
{ "id": "en", "title": "English", "meta": "50MB" },
{ "id": "de", "title": "German", "meta": "645MB" },
{ "id": "fr", "title": "French", "meta": "1.33GB" },
{ "id": "es", "title": "Spanish", "meta": "377MB"}]
}
], ],
"QUICKSTART_MODELS": [ "QUICKSTART_MODELS": [
{ "id": "lang", "title": "Language", "options": [ { "id": "lang", "title": "Language"},
{ "id": "en", "title": "English", "checked": true },
{ "id": "de", "title": "German" },
{ "id": "fr", "title": "French" },
{ "id": "es", "title": "Spanish" }]
},
{ "id": "load", "title": "Loading style", "options": [ { "id": "load", "title": "Loading style", "options": [
{ "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." }, { "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." },
{ "id": "module", "title": "Import as module", "help": "Import the model explicitly as a Python module." }] { "id": "module", "title": "Import as module", "help": "Import the model explicitly as a Python module." }]
@ -98,50 +83,15 @@
} }
], ],
"MODELS": {
"en": [
{ "id": "en_core_web_sm", "lang": "English", "feats": [1, 1, 1, 1], "size": "50 MB", "license": "CC BY-SA", "def": true },
{ "id": "en_core_web_md", "lang": "English", "feats": [1, 1, 1, 1], "size": "1 GB", "license": "CC BY-SA" },
{ "id": "en_depent_web_md", "lang": "English", "feats": [1, 1, 1, 0], "size": "328 MB", "license": "CC BY-SA" },
{ "id": "en_vectors_glove_md", "lang": "English", "feats": [1, 0, 0, 1], "size": "727 MB", "license": "CC BY-SA" }
],
"de": [
{ "id": "de_core_news_md", "lang": "German", "feats": [1, 1, 1, 1], "size": "645 MB", "license": "CC BY-SA" }
],
"fr": [
{ "id": "fr_depvec_web_lg", "lang": "French", "feats": [1, 1, 0, 1], "size": "1.33 GB", "license": "CC BY-NC" }
],
"es": [
{ "id": "es_core_web_md", "lang": "Spanish", "feats": [1, 1, 1, 1], "size": "377 MB", "license": "CC BY-SA"}
]
},
"EXAMPLE_SENTENCES": {
"en": "This is a sentence.",
"de": "Dies ist ein Satz.",
"fr": "C'est une phrase.",
"es": "Esto es una frase."
},
"ALPHA": true, "ALPHA": true,
"V_CSS": "1.6", "V_CSS": "2.0",
"V_JS": "1.2", "V_JS": "2.0",
"DEFAULT_SYNTAX": "python", "DEFAULT_SYNTAX": "python",
"ANALYTICS": "UA-58931649-1", "ANALYTICS": "UA-58931649-1",
"MAILCHIMP": { "MAILCHIMP": {
"user": "spacy.us12", "user": "spacy.us12",
"id": "83b0498b1e7fa3c91ce68c3f1", "id": "83b0498b1e7fa3c91ce68c3f1",
"list": "89ad33e698" "list": "89ad33e698"
},
"BADGES": {
"pipy": {
"badge": "https://img.shields.io/pypi/v/spacy.svg?style=flat-square",
"link": "https://pypi.python.org/pypi/spacy"
},
"conda": {
"badge": "https://anaconda.org/conda-forge/spacy/badges/version.svg",
"link": "https://anaconda.org/conda-forge/spacy"
}
} }
} }
} }

View File

@ -1,8 +1,6 @@
//- 💫 INCLUDES > FOOTER //- 💫 INCLUDES > FOOTER
include _mixins footer.o-footer.u-text
footer.o-footer.u-text.u-border-dotted
+grid.o-content +grid.o-content
each group, label in FOOTER each group, label in FOOTER
+grid-col("quarter") +grid-col("quarter")
@ -13,18 +11,18 @@ footer.o-footer.u-text.u-border-dotted
li li
+a(url)=item +a(url)=item
if SECTION != "docs" if SECTION == "index"
+grid-col("quarter") +grid-col("quarter")
include _newsletter include _newsletter
if SECTION == "docs" if SECTION != "index"
.o-content.o-block.u-border-dotted .o-content.o-block.u-border-dotted
include _newsletter include _newsletter
.o-inline-list.u-text-center.u-text-tiny.u-color-subtle .o-inline-list.u-text-center.u-text-tiny.u-color-subtle
span &copy; 2016-#{new Date().getFullYear()} #[+a(COMPANY_URL, true)=COMPANY] span &copy; 2016-#{new Date().getFullYear()} #[+a(COMPANY_URL, true)=COMPANY]
+a(COMPANY_URL, true) +a(COMPANY_URL, true)(aria-label="Explosion AI")
+svg("graphics", "explosion", 45).o-icon.u-color-theme.u-grayscale +icon("explosion", 45).o-icon.u-color-theme.u-grayscale
+a(COMPANY_URL + "/legal", true) Legal / Imprint +a(COMPANY_URL + "/legal", true) Legal / Imprint

View File

@ -1,35 +1,71 @@
//- 💫 INCLUDES > FUNCTIONS //- 💫 INCLUDES > FUNCTIONS
//- More descriptive variables for current.path and current.source //- Descriptive variables, available in the global scope
- CURRENT = current.source - CURRENT = current.source
- SECTION = current.path[0] - SECTION = current.path[0]
- SUBSECTION = current.path[1] - LANGUAGES = public.models._data.LANGUAGES
- MODELS = public.models._data.MODELS
- CURRENT_MODELS = MODELS[current.source] || []
- MODEL_COUNT = Object.keys(MODELS).map(m => Object.keys(MODELS[m]).length).reduce((a, b) => a + b)
- MODEL_LANG_COUNT = Object.keys(MODELS).length
- LANG_COUNT = Object.keys(LANGUAGES).length
- MODEL_META = public.models._data.MODEL_META
- MODEL_LICENSES = public.models._data.MODEL_LICENSES
- MODEL_ACCURACY = public.models._data.MODEL_ACCURACY
- EXAMPLE_SENTENCES = public.models._data.EXAMPLE_SENTENCES
- IS_PAGE = (SECTION != "index") && !landing
- IS_MODELS = (SECTION == "models" && LANGUAGES[current.source])
- HAS_MODELS = IS_MODELS && CURRENT_MODELS.length
//- Add prefixes to items of an array (for modifier CSS classes) //- Add prefixes to items of an array (for modifier CSS classes)
array - [array] list of class names or options, e.g. ["foot"]
prefix - [string] prefix to add to each class, e.g. "c-table__row"
RETURNS - [array] list of modified class names
- function prefixArgs(array, prefix) { - function prefixArgs(array, prefix) {
- return array.map(function(arg) { - return array.map(arg => prefix + '--' + arg).join(' ');
- return prefix + '--' + arg; - }
- }).join(' ');
//- Convert API paths (semi-temporary fix for renamed sections)
path - [string] link path supplied to +api mixin
RETURNS - [string] new link path to correct location
//- Examples: "spacy#load" becomes "top-level#spacy.load" (same for the
//- "displacy#" and "util#" prefixes), "cli#download" becomes
//- "top-level#download". Any other path is returned unchanged.
- function convertAPIPath(path) {
- if (path.startsWith('spacy#') || path.startsWith('displacy#') || path.startsWith('util#')) {
- var comps = path.split('#');
- return "top-level#" + comps[0] + '.' + comps[1];
- }
- else if (path.startsWith('cli#')) {
- return "top-level#" + path.split('#')[1];
- }
- return path;
- }
//- Get model components from ID. Components can then be looked up in LANGUAGES
and MODEL_META respectively, to get their human-readable form.
id - [string] model ID, e.g. "en_core_web_sm"
RETURNS - [object] object keyed by components lang, type, genre and size
- function getModelComponents(id) {
- var comps = id.split('_');
- return {'lang': comps[0], 'type': comps[1], 'genre': comps[2], 'size': comps[3]}
- } - }
//- Generate GitHub links //- Generate GitHub links
repo - [string] name of repo owned by explosion
filepath - [string] logical path to file relative to repository root
branch - [string] optional branch, defaults to "master"
RETURNS - [string] the correct link to the file on GitHub
- function gh(repo, filepath, branch) { - function gh(repo, filepath, branch) {
- var branch = ALPHA ? 'develop' : branch - var branch = ALPHA ? 'develop' : branch
- return 'https://github.com/' + SOCIAL.github + '/' + repo + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' ); - return 'https://github.com/' + SOCIAL.github + '/' + (repo || '') + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' );
- }
//- Get social images
- function getSocialImg() {
- var base = SITE_URL + '/assets/img/social/preview_'
- var image = ALPHA ? 'alpha' : 'default'
- if (preview) image = preview
- else if (SECTION == 'docs' && !ALPHA) image = 'docs'
- return base + image + '.jpg'
- } - }

View File

@ -1,5 +1,13 @@
//- 💫 MIXINS > BASE //- 💫 MIXINS > BASE
//- Section
id - [string] anchor assigned to section (used for breadcrumb navigation)
mixin section(id)
section.o-section(id="section-" + id data-section=id)
block
//- Aside wrapper //- Aside wrapper
label - [string] aside label label - [string] aside label
@ -11,34 +19,26 @@ mixin aside-wrapper(label)
block block
//- Date
input - [string] date in the format YYYY-MM-DD
mixin date(input) //- SVG from map (uses embedded SVG sprite)
- var date = new Date(input)
- var months = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December' ]
time(datetime=JSON.parse(JSON.stringify(date)))&attributes(attributes)=months[date.getMonth()] + ' ' + date.getDate() + ', ' + date.getFullYear()
//- SVG from map
file - [string] SVG file name in /assets/img/
name - [string] SVG symbol id name - [string] SVG symbol id
width - [integer] width in px width - [integer] width in px
height - [integer] height in px (default: same as width) height - [integer] height in px (default: same as width)
mixin svg(file, name, width, height) mixin svg(name, width, height)
svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes) svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
use(xlink:href="/assets/img/#{file}.svg##{name}") use(xlink:href="#svg_#{name}")
//- Icon //- Icon
name - [string] icon name, should be SVG symbol ID name - [string] icon name (will be used as symbol id: #svg_{name})
size - [integer] icon width and height (default: 20) width - [integer] icon width (default: 20)
height - [integer] icon height (defaults to width)
mixin icon(name, size) mixin icon(name, width, height)
- var size = size || 20 - var width = width || 20
+svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes) - var height = height || width
+svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes)
//- Pro/Con/Neutral icon //- Pro/Con/Neutral icon
@ -46,8 +46,8 @@ mixin icon(name, size)
size - [integer] icon size (optional) size - [integer] icon size (optional)
mixin procon(icon, size) mixin procon(icon, size)
- colors = { pro: "green", con: "red", neutral: "yellow" } - colors = { pro: "green", con: "red", neutral: "subtle" }
+icon(icon, size)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) +icon("circle", size || 16)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)
//- Headlines Helper Mixin //- Headlines Helper Mixin
@ -80,8 +80,7 @@ mixin headline(level)
mixin permalink(id) mixin permalink(id)
if id if id
a.u-permalink(id=id href="##{id}") a.u-permalink(href="##{id}")
+icon("anchor").u-permalink__icon
block block
else else
@ -109,7 +108,7 @@ mixin quickstart(groups, headline, description, hide_results)
.c-quickstart__fields .c-quickstart__fields
for option in group.options for option in group.options
input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked) input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
label.c-quickstart__label(for="qs-#{option.id}")!=option.title label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title
if option.meta if option.meta
| #[span.c-quickstart__label__meta (#{option.meta})] | #[span.c-quickstart__label__meta (#{option.meta})]
if option.help if option.help
@ -122,12 +121,10 @@ mixin quickstart(groups, headline, description, hide_results)
code.c-code-block__content.c-quickstart__code(data-qs-results="") code.c-code-block__content.c-quickstart__code(data-qs-results="")
block block
.c-quickstart__info.u-text-tiny.o-block.u-text-right
| Like this widget? Check out #[+a("https://github.com/ines/quickstart").u-link quickstart.js]!
//- Quickstart code item //- Quickstart code item
data [object] - Rendering conditions (keyed by option group ID, value: option) data - [object] Rendering conditions (keyed by option group ID, value: option)
style - [string] modifier ID for line style
mixin qs(data, style) mixin qs(data, style)
- args = {} - args = {}
@ -148,6 +145,13 @@ mixin terminal(label)
+code.x-terminal__code +code.x-terminal__code
block block
//- Chart.js
id - [string] chart ID, will be assigned as #chart_{id}
mixin chart(id)
figure.o-block&attributes(attributes)
canvas(id="chart_#{id}" width="800" height="400" style="max-width: 100%")
//- Gitter chat button and widget //- Gitter chat button and widget
button - [string] text shown on button button - [string] text shown on button
@ -156,26 +160,24 @@ mixin terminal(label)
mixin gitter(button, label) mixin gitter(button, label)
aside.js-gitter.c-chat.is-collapsed(data-title=(label || button)) aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
button.js-gitter-button.c-chat__button.u-text-small button.js-gitter-button.c-chat__button.u-text-tag
+icon("chat").o-icon--inline +icon("chat", 16).o-icon--inline
!=button !=button
//- Badge //- Badge
name - [string] "pipy" or "conda" image - [string] path to badge image
url - [string] badge link
mixin badge(name) mixin badge(image, url)
- site = BADGES[name] +a(url).u-padding-small.u-hide-link&attributes(attributes)
img.o-badge(src=image alt=url height="20")
if site
+a(site.link).u-padding-small
img(src=site.badge alt="{name} version" height="20")
//- Logo //- spaCy logo
mixin logo() mixin logo()
+svg("graphics", "spacy", 675, 215).o-logo&attributes(attributes) +svg("spacy", 675, 215).o-logo&attributes(attributes)
//- Landing //- Landing
@ -186,18 +188,56 @@ mixin landing-header()
.c-landing__content .c-landing__content
block block
mixin landing-banner(headline, label)
.c-landing__banner.u-padding.o-block.u-color-light
+grid.c-landing__banner__content.o-no-block
+grid-col("third")
h3.u-heading.u-heading-1
if label
div
span.u-text-label.u-text-label--light=label
!=headline
mixin landing-badge(url, graphic, alt, size) +grid-col("two-thirds").c-landing__banner__text
+a(url)(aria-label=alt title=alt).c-landing__badge block
+svg("graphics", graphic, size || 225)
//- Landing logos
//- title - [string] headline displayed above the logo rows
//- logos - [array] rows of logos; each row is an object keyed by icon name,
//-         with values [url, width, height] passed on to +a and +icon
//- The mixin's block content is rendered inside the last row only.
//- NOTE(review): .o-no-block is applied to every row and then again
//- conditionally for the last row - confirm whether the static class is intended.
mixin landing-logos(title, logos)
.o-content.u-text-center&attributes(attributes)
h3.u-heading.u-text-label.u-color-dark=title
each row, i in logos
- var is_last = i == logos.length - 1
+grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null)
each details, name in row
+a(details[0]).u-padding-medium
+icon(name, details[1], details[2])
if is_last
block
//- Under construction (temporary) //- Under construction (temporary)
Marks sections that still need to be completed for the v2.0 release. Marks sections that still need to be completed for the v2.0 release.
mixin under-construction() mixin under-construction()
+infobox("🚧 Under construction") +infobox("Under construction", "🚧")
| This section is still being written and will be updated for the v2.0 | This section is still being written and will be updated for the v2.0
| release. Is there anything that you think should definitely be mentioned or | release. Is there anything that you think should definitely be mentioned or
| explained here? Any examples you'd like to see? #[strong Let us know] | explained here? Any examples you'd like to see? #[strong Let us know]
| on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub! | on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub!
//- Alpha infobox (temporary)
Added in the templates to notify user that they're visiting the alpha site.
mixin alpha-info()
+infobox("You are viewing the spaCy v2.0.0 alpha docs", "⚠️")
strong This page is part of the alpha documentation for spaCy v2.0.
| It does not reflect the state of the latest stable release.
| Because v2.0 is still under development, the implementation
| may differ from the intended state described here. See the
| #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
| for details on how to install and test the new version. To
| read the official docs for spaCy v1.x,
| #[+a("https://spacy.io/docs") go here].

View File

@ -8,11 +8,15 @@ include _mixins-base
level - [integer] headline level, corresponds to h1, h2, h3 etc. level - [integer] headline level, corresponds to h1, h2, h3 etc.
id - [string] unique identifier, creates permalink (optional) id - [string] unique identifier, creates permalink (optional)
mixin h(level, id) mixin h(level, id, source)
+headline(level).u-heading&attributes(attributes) +headline(level).u-heading(id=id)&attributes(attributes)
+permalink(id) +permalink(id)
block block
if source
+button(gh("spacy", source), false, "secondary", "small").u-nowrap.u-float-right
span Source #[+icon("code", 14).o-icon--inline]
//- External links //- External links
url - [string] link href url - [string] link href
@ -38,21 +42,23 @@ mixin src(url)
//- API link (with added tag and automatically generated path) //- API link (with added tag and automatically generated path)
path - [string] path to API docs page relative to /docs/api/ path - [string] path to API docs page relative to /api/
mixin api(path) mixin api(path)
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap - path = convertAPIPath(path)
+a("/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap
block block
| #[+icon("book", 18).o-icon--inline.u-color-theme] | #[+icon("book", 16).o-icon--inline.u-color-theme]
//- Help icon with tooltip //- Help icon with tooltip
tooltip - [string] Tooltip text tooltip - [string] Tooltip text
icon_size - [integer] Optional size of help icon in px.
mixin help(tooltip) mixin help(tooltip, icon_size)
span(data-tooltip=tooltip)&attributes(attributes) span(data-tooltip=tooltip)&attributes(attributes)
+icon("help", 16).i-icon--inline +icon("help", icon_size || 16).o-icon--inline
//- Aside for text //- Aside for text
@ -68,24 +74,43 @@ mixin aside(label)
label - [string] aside title (optional or false for no label) label - [string] aside title (optional or false for no label)
language - [string] language for syntax highlighting (default: "python") language - [string] language for syntax highlighting (default: "python")
supports basic relevant languages available for PrismJS supports basic relevant languages available for PrismJS
prompt - [string] prompt displayed before first line, e.g. "$"
mixin aside-code(label, language) mixin aside-code(label, language, prompt)
+aside-wrapper(label) +aside-wrapper(label)
+code(false, language).o-no-block +code(false, language, prompt).o-no-block
block block
//- Infobox //- Infobox
label - [string] infobox title (optional or false for no title) label - [string] infobox title (optional or false for no title)
emoji - [string] optional emoji displayed before the title, necessary as
argument to be able to wrap it for spacing
mixin infobox(label) mixin infobox(label, emoji)
aside.o-box.o-block.u-text-small aside.o-box.o-block.u-text-small
if label if label
h3.u-text-label.u-color-theme=label h3.u-heading.u-text-label.u-color-theme
if emoji
span.o-emoji=emoji
| #{label}
block block
//- Logos displayed in the top corner of some infoboxes
logos - [array] List of icon ID, width, height and link.
mixin infobox-logos(...logos)
.o-box__logos.u-text-right.u-float-right
for logo in logos
if logo[3]
| #[+a(logo[3]).u-inline-block.u-hide-link.u-padding-small #[+icon(logo[0], logo[1], logo[2]).u-color-dark]]
else
| #[+icon(logo[0], logo[1], logo[2]).u-color-dark]
//- Link button //- Link button
url - [string] link href url - [string] link href
trusted - [boolean] if not set / false, rel="noopener nofollow" is added trusted - [boolean] if not set / false, rel="noopener nofollow" is added
@ -94,7 +119,7 @@ mixin infobox(label)
see assets/css/_components/_buttons.sass see assets/css/_components/_buttons.sass
mixin button(url, trusted, ...style) mixin button(url, trusted, ...style)
- external = url.includes("http") - external = url && url.includes("http")
a.c-button.u-text-label(href=url class=prefixArgs(style, "c-button") role="button" target=external ? "_blank" : null rel=external && !trusted ? "noopener nofollow" : null)&attributes(attributes) a.c-button.u-text-label(href=url class=prefixArgs(style, "c-button") role="button" target=external ? "_blank" : null rel=external && !trusted ? "noopener nofollow" : null)&attributes(attributes)
block block
@ -103,31 +128,33 @@ mixin button(url, trusted, ...style)
label - [string] aside title (optional or false for no label) label - [string] aside title (optional or false for no label)
language - [string] language for syntax highlighting (default: "python") language - [string] language for syntax highlighting (default: "python")
supports basic relevant languages available for PrismJS supports basic relevant languages available for PrismJS
prompt - [string] prompt or icon to display next to code block, (mostly used for old/new) prompt - [string] prompt displayed before first line, e.g. "$"
height - [integer] optional height to clip code block to height - [integer] optional height to clip code block to
icon - [string] icon displayed next to code block (e.g. "accept" for new code)
wrap - [boolean] wrap text and disable horizontal scrolling
mixin code(label, language, prompt, height) mixin code(label, language, prompt, height, icon, wrap)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes) pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
if label if label
h4.u-text-label.u-text-label--dark=label h4.u-text-label.u-text-label--dark=label
- var icon = (prompt == 'accept' || prompt == 'reject') - var icon = icon || (prompt == 'accept' || prompt == 'reject')
if icon if icon
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'} - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
.c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null) .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
+icon(icon, 18) +icon(icon, 18)
code.c-code-block__content(data-prompt=icon ? null : prompt) code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=icon ? null : prompt)
block block
//- Code blocks to display old/new versions //- Code blocks to display old/new versions
mixin code-old() mixin code-old()
+code(false, false, "reject").o-block-small +code(false, false, false, false, "reject").o-block-small
block block
mixin code-new() mixin code-new()
+code(false, false, "accept").o-block-small +code(false, false, false, false, "accept").o-block-small
block block
@ -138,12 +165,33 @@ mixin code-new()
mixin codepen(slug, height, default_tab) mixin codepen(slug, height, default_tab)
figure.o-block(style="min-height: #{height}px")&attributes(attributes) figure.o-block(style="min-height: #{height}px")&attributes(attributes)
.codepen(data-height=height data-theme-id="26467" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen) .codepen(data-height=height data-theme-id="31335" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen)
+a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen +a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen
script(async src="https://assets.codepen.io/assets/embed/ei.js") script(async src="https://assets.codepen.io/assets/embed/ei.js")
//- GitHub embed
repo - [string] repository owned by explosion organization
file - [string] logical path to file, relative to repository root
alt_file - [string] alternative file path used in footer and link button
height - [integer] height of code preview in px
mixin github(repo, file, alt_file, height)
- var branch = ALPHA ? "develop" : "master"
- var height = height || 250
figure.o-block
pre.c-code-block.o-block-small(class="lang-#{(language || DEFAULT_SYNTAX)}" style="height: #{height}px; min-height: #{height}px")
code.c-code-block__content(data-gh-embed="#{repo}/#{branch}/#{file}")
footer.o-grid.u-text
.o-block-small.u-flex-full #[+icon("github")] #[code=repo + '/' + (alt_file || file)]
div
+button(gh(repo, alt_file || file), false, "primary", "small") View on GitHub
//- Images / figures //- Images / figures
url - [string] url or path to image url - [string] url or path to image
width - [integer] image width in px, for better rendering (default: 500) width - [integer] image width in px, for better rendering (default: 500)
@ -168,10 +216,26 @@ mixin image-caption()
block block
//- Label //- Graphic or illustration with button
original - [string] Path to original image
mixin graphic(original)
+image
block
if original
.u-text-right
+button(original, false, "secondary", "small") View large graphic
//- Labels
mixin label() mixin label()
.u-text-label.u-color-subtle&attributes(attributes) .u-text-label.u-color-dark&attributes(attributes)
block
mixin label-inline()
strong.u-text-label.u-color-dark&attributes(attributes)
block block
@ -188,8 +252,10 @@ mixin tag()
mixin tag-model(...capabs) mixin tag-model(...capabs)
- var intro = "To use this functionality, spaCy needs a model to be installed" - var intro = "To use this functionality, spaCy needs a model to be installed"
- var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : "" - var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : ""
+tag Requires model
+help(intro + ext + ".").u-color-theme span.u-nowrap
+tag Needs model
+help(intro + ext + ".").u-color-theme
//- "New" tag to label features new in a specific version //- "New" tag to label features new in a specific version
@ -219,15 +285,9 @@ mixin list(type, start)
//- List item (only used within +list) //- List item (only used within +list)
mixin item(procon) mixin item()
if procon li.c-list__item&attributes(attributes)
li&attributes(attributes) block
+procon(procon).c-list__icon
block
else
li.c-list__item&attributes(attributes)
block
//- Table //- Table
@ -237,9 +297,9 @@ mixin table(head)
table.c-table.o-block&attributes(attributes) table.c-table.o-block&attributes(attributes)
if head if head
+row +row("head")
each column in head each column in head
th.c-table__head-cell.u-text-label=column +head-cell=column
block block
@ -251,10 +311,11 @@ mixin row(...style)
block block
//- Footer table row (only ued within +table)
mixin footrow() //- Header table cell (only used within +row)
tr.c-table__row.c-table__row--foot&attributes(attributes)
mixin head-cell()
th.c-table__head-cell.u-text-label&attributes(attributes)
block block
@ -284,71 +345,58 @@ mixin grid-col(width)
//- Card (only used within +grid) //- Card (only used within +grid)
title - [string] card title title - [string] card title
details - [object] url, image, author, description, tags etc. url - [string] link for card
(see /docs/usage/_data.json) author - [string] optional author, displayed as byline at the bottom
icon - [string] optional ID of icon displayed with card
width - [string] optional width of grid column, defaults to "half"
mixin card(title, details) mixin card(title, url, author, icon, width)
+grid-col("half").o-card.u-text&attributes(attributes) +grid-col(width || "half").o-box.o-grid.o-grid--space.u-text&attributes(attributes)
if details.image +a(url)
+a(details.url).o-block-small h4.u-heading.u-text-label
img(src=details.image alt=title width="300" role="presentation") if icon
+icon(icon, 25).u-float-right
if title if title
+a(details.url) span.u-color-dark=title
+h(3)=title .o-block-small.u-text-small
block
if details.author if author
.u-text-small.u-color-subtle by #{details.author} .u-color-subtle.u-text-tiny by #{author}
if details.description || details.tags
ul
if details.description
li=details.description
if details.tags
li
each tag in details.tags
span.u-text-tag #{tag}
| &nbsp;
block
//- Simpler card list item (only used within +list) //- Table of contents, to be used with +item mixins for links
title - [string] card title col - [string] width of column (see +grid-col)
details - [object] url, image, author, description, tags etc.
(see /docs/usage/_data.json)
mixin card-item(title, details) mixin table-of-contents(col)
+item&attributes(attributes) +grid-col(col || "half")
+a(details.url)=title +infobox
+label.o-block-small Table of contents
if details.description +list("numbers").u-text-small.o-no-block
br block
span=details.description
if details.author
br
span.u-text-small.u-color-subtle by #{details.author}
//- Table row for models table //- Bibliography
id - [string] ID of bibliography component, for anchor links. Can be used if
there's more than one bibliography on one page.
mixin model-row(name, lang, procon, size, license, default_model, divider) mixin bibliography(id)
- var licenses = { "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/", "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/" } section(id=id || "bibliography")
+infobox
+label.o-block-small Bibliography
+list("numbers").u-text-small.o-no-block
block
+row(divider ? "divider": null)
+cell #[code=name] //- Footnote
if default_model id - [string / integer] ID of footnote.
| #[span.u-color-theme(title="default model") #[+icon("star", 16)]] bib_id - [string] ID of bibliography component, defaults to "bibliography".
+cell=lang tooltip - [string] optional text displayed as tooltip
each icon in procon
+cell.u-text-center #[+procon(icon ? "pro" : "con")] mixin fn(id, bib_id, tooltip)
+cell.u-text-right=size sup.u-padding-small(id="bib" + id data-tooltip=tooltip)
+cell span.u-text-tag
if license in licenses +a("#" + (bib_id || "bibliography")).u-hide-link #{id}
+a(licenses[license])=license
//- Table rows for annotation specs //- Table rows for annotation specs
@ -383,14 +431,3 @@ mixin annotation-row(annots, style)
else else
+cell=cell +cell=cell
block block
//- Table of contents, to be used with +item mixins for links
col - [string] width of column (see +grid-col)
mixin table-of-contents(col)
+grid-col(col || "half")
+infobox
+label.o-block-small Table of contents
+list("numbers").u-text-small.o-no-block
block

View File

@ -1,19 +1,15 @@
//- 💫 INCLUDES > TOP NAVIGATION //- 💫 INCLUDES > TOP NAVIGATION
include _mixins
nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null) nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null)
a(href='/') #[+logo] a(href="/" aria-label=SITENAME) #[+logo]
if SUBSECTION != "index"
.u-text-label.u-padding-small.u-hidden-xs=SUBSECTION
ul.c-nav__menu ul.c-nav__menu
- var NAV = ALPHA ? { "Usage": "/docs/usage", "Reference": "/docs/api" } : NAVIGATION - var current_url = '/' + current.path[0]
each url, item in NAVIGATION
each url, item in NAV li.c-nav__menu__item(class=(current_url == url) ? "is-active" : null)
li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null)
+a(url)=item +a(url)=item
li.c-nav__menu__item li.c-nav__menu__item.u-hidden-xs
+a(gh("spaCy"))(aria-label="GitHub").u-hidden-xs #[+icon("github", 20)] +a(gh("spaCy"))(aria-label="GitHub") #[+icon("github", 20)]
progress.c-progress.js-progress(value="0" max="1")

View File

@ -1,6 +1,6 @@
//- 💫 INCLUDES > NEWSLETTER //- 💫 INCLUDES > NEWSLETTER
ul.o-block ul.o-block-small
li.u-text-label.u-color-subtle Stay in the loop! li.u-text-label.u-color-subtle Stay in the loop!
li Receive updates about new releases, tutorials and more. li Receive updates about new releases, tutorials and more.
@ -10,7 +10,6 @@ form.o-grid#mc-embedded-subscribe-form(action="//#{MAILCHIMP.user}.list-manage.c
div(style="position: absolute; left: -5000px;" aria-hidden="true") div(style="position: absolute; left: -5000px;" aria-hidden="true")
input(type="text" name="b_#{MAILCHIMP.id}_#{MAILCHIMP.list}" tabindex="-1" value="") input(type="text" name="b_#{MAILCHIMP.id}_#{MAILCHIMP.list}" tabindex="-1" value="")
.o-grid-col.u-border.u-padding-small .o-grid-col.o-grid.o-grid--nowrap.o-field.u-padding-small
input#mce-EMAIL.u-text(type="email" name="EMAIL" placeholder="Your email") input#mce-EMAIL.o-field__input.u-text(type="email" name="EMAIL" placeholder="Your email" aria-label="Your email")
button#mc-embedded-subscribe.o-field__button.u-text-label.u-color-theme.u-nowrap(type="submit" name="subscribe") Sign up
button#mc-embedded-subscribe.u-text-label.u-color-theme(type="submit" name="subscribe") Sign up

View File

@ -1,47 +1,56 @@
//- 💫 INCLUDES > DOCS PAGE TEMPLATE //- 💫 INCLUDES > DOCS PAGE TEMPLATE
- sidebar_content = (SUBSECTION != "index") ? public.docs[SUBSECTION]._data.sidebar : public.docs._data.sidebar || FOOTER - sidebar_content = (public[SECTION] ? public[SECTION]._data.sidebar : public._data[SECTION] ? public._data[SECTION].sidebar : false) || FOOTER
include _sidebar include _sidebar
main.o-main.o-main--sidebar.o-main--aside main.o-main.o-main--sidebar.o-main--aside
article.o-content article.o-content
+grid.o-no-block +grid.o-no-block
+grid-col(source ? "two-thirds" : "full") +h(1).u-heading--title=title.replace("'", "")
+h(1)=title if tag
if tag +tag=tag
+tag=tag if tag_new
+tag-new(tag_new)
if teaser
.u-heading__teaser.u-text-small.u-color-dark=teaser
else if IS_MODELS
.u-heading__teaser.u-text-small.u-color-dark
| Available statistical models for
| #[code=current.source] (#{LANGUAGES[current.source]}).
if source if source
+grid-col("third").u-text-right .o-block.u-text-right
.o-inline-list +button(gh("spacy", source), false, "secondary", "small").u-nowrap
+button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)] | Source #[+icon("code", 14)]
//-if ALPHA
//- +alpha-info
if ALPHA if IS_MODELS
+infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs") include _page_models
strong This page is part of the alpha documentation for spaCy v2.0. else
| It does not reflect the state of the latest stable release. !=yield
| Because v2.0 is still under development, the implementation
| may differ from the intended state described here. See the
| #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
| for details on how to install and test the new version. To
| read the official docs for spaCy v1.x,
| #[+a("https://spacy.io/docs") go here].
!=yield
+grid.o-content.u-text +grid.o-content.u-text
+grid-col("half") +grid-col("half")
if next && public.docs[SUBSECTION]._data[next] if !IS_MODELS
- data = public.docs[SUBSECTION]._data[next]
.o-inline-list .o-inline-list
span #[strong.u-text-label Read next:] #[+a(next).u-link=data.title] +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary", "small")
| #[span.o-icon Suggest edits] #[+icon("code", 14)]
+grid-col("half").u-text-right +grid-col("half").u-text-right
.o-inline-list if next && public[SECTION]._data[next]
+button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)] - data = public[SECTION]._data[next]
+grid("vcenter")
+a(next).u-text-small.u-flex-full
h4.u-text-label.u-color-dark Read next
| #{data.title}
+a(next).c-icon-button.c-icon-button--right(aria-hidden="true")
+icon("arrow-right", 24)
+gitter("spaCy chat") +gitter("spaCy chat")

View File

@ -0,0 +1,77 @@
//- 💫 INCLUDES > MODELS PAGE TEMPLATE
for id in CURRENT_MODELS
+section(id)
+grid("vcenter").o-no-block(id=id)
+grid-col("two-thirds")
+h(2)
+a("#" + id).u-permalink=id
+grid-col("third").u-text-right
.u-color-subtle.u-text-tiny
+button(gh("spacy-models") + "/releases", true, "secondary", "small")(data-tpl=id data-tpl-key="download")
| Release details
.u-padding-small Latest: #[code(data-tpl=id data-tpl-key="version") n/a]
+aside-code("Installation", "bash", "$").
spacy download #{id}
- var comps = getModelComponents(id)
p(data-tpl=id data-tpl-key="description")
div(data-tpl=id data-tpl-key="error" style="display: none")
+infobox
| Unable to load model details from GitHub. To find out more
| about this model, see the overview of the
| #[+a(gh("spacy-models") + "/releases") latest model releases].
+table(data-tpl=id data-tpl-key="table")
+row
+cell #[+label Language]
+cell #[+tag=comps.lang] #{LANGUAGES[comps.lang]}
for comp, label in {"Type": comps.type, "Genre": comps.genre}
+row
+cell #[+label=label]
+cell #[+tag=comp] #{MODEL_META[comp]}
+row
+cell #[+label Size]
+cell #[+tag=comps.size] #[span(data-tpl=id data-tpl-key="size") #[em n/a]]
each label in ["Pipeline", "Sources", "Author", "License"]
- var field = label.toLowerCase()
+row
+cell.u-nowrap
+label=label
if MODEL_META[field]
| #[+help(MODEL_META[field]).u-color-subtle]
+cell
span(data-tpl=id data-tpl-key=field) #[em n/a]
+row(data-tpl=id data-tpl-key="compat-wrapper" style="display: none")
+cell
+label Compat #[+help("Latest compatible model version for your spaCy installation").u-color-subtle]
+cell
.o-field.u-float-left
select.o-field__select.u-text-small(data-tpl=id data-tpl-key="compat")
.o-empty(data-tpl=id data-tpl-key="compat-versions") &nbsp;
section(data-tpl=id data-tpl-key="accuracy-wrapper" style="display: none")
+grid.o-no-block
+grid-col("third")
+h(4) Accuracy
+table.o-block-small
for label, field in MODEL_ACCURACY
+row(style="display: none")
+cell.u-nowrap
+label=label
if MODEL_META[field]
| #[+help(MODEL_META[field]).u-color-subtle]
+cell.u-text-right(data-tpl=id data-tpl-key=field)
| n/a
+grid-col("two-thirds")
+h(4) Comparison
+chart(id).u-padding-small
p.u-text-small.u-color-dark(data-tpl=id data-tpl-key="notes")

View File

@ -1,27 +1,46 @@
//- 💫 INCLUDES > SCRIPTS //- 💫 INCLUDES > SCRIPTS
script(src="/assets/js/main.js?v#{V_JS}") if quickstart
script(src="/assets/js/prism.js") script(src="/assets/js/quickstart.min.js")
if SECTION == "docs" if IS_PAGE
if quickstart script(src="/assets/js/in-view.min.js")
script(src="/assets/js/quickstart.js")
script var qs = new Quickstart("#qs")
script. if HAS_MODELS
((window.gitter = {}).chat = {}).options = { script(src="/assets/js/chart.min.js")
useStyles: false,
activationElement: '.js-gitter-button',
targetElement: '.js-gitter',
room: '!{SOCIAL.gitter}'
};
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
if environment == "deploy" if environment == "deploy"
script script(async src="https://www.google-analytics.com/analytics.js")
script(src="/assets/js/prism.min.js")
script(src="/assets/js/main.js?v#{V_JS}")
script
| new ProgressBar('.js-progress');
if changelog
| new Changelog('!{SOCIAL.github}', 'spacy');
if quickstart
| new Quickstart("#qs");
if IS_PAGE
| new SectionHighlighter('data-section', 'data-nav');
| new GitHubEmbed('!{SOCIAL.github}', 'data-gh-embed');
| ((window.gitter = {}).chat = {}).options = {
| useStyles: false,
| activationElement: '.js-gitter-button',
| targetElement: '.js-gitter',
| room: '!{SOCIAL.gitter}'
| };
if HAS_MODELS
| new ModelLoader('!{MODELS_REPO}', !{JSON.stringify(CURRENT_MODELS)}, !{JSON.stringify(MODEL_LICENSES)}, !{JSON.stringify(MODEL_ACCURACY)});
if environment == "deploy"
| window.ga=window.ga||function(){ | window.ga=window.ga||function(){
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
script(async src="https://www.google-analytics.com/analytics.js") if IS_PAGE
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)

View File

@ -1,13 +1,23 @@
//- 💫 INCLUDES > SIDEBAR //- 💫 INCLUDES > SIDEBAR
include _mixins
menu.c-sidebar.js-sidebar.u-text menu.c-sidebar.js-sidebar.u-text
if sidebar_content if sidebar_content
each items, menu in sidebar_content each items, sectiontitle in sidebar_content
ul.c-sidebar__section.o-block ul.c-sidebar__section.o-block-small
li.u-text-label.u-color-subtle=menu li.u-text-label.u-color-dark=sectiontitle
each url, item in items each url, item in items
li(class=(CURRENT == url || (CURRENT == "index" && url == "./")) ? "is-active" : null) - var is_current = CURRENT == url || (CURRENT == "index" && url == "./")
+a(url)=item li.c-sidebar__item
+a(url)(class=is_current ? "is-active" : null)=item
if is_current
if IS_MODELS && CURRENT_MODELS.length
- menu = Object.assign({}, ...CURRENT_MODELS.map(id => ({ [id]: id })))
if menu
ul.c-sidebar__crumb.u-hidden-sm
- var counter = 0
for id, title in menu
- counter++
li.c-sidebar__crumb__item(data-nav=id class=(counter == 1) ? "is-active" : null)
+a("#section-" + id)=title

157
website/_includes/_svg.jade Normal file

File diff suppressed because one or more lines are too long

View File

@ -2,11 +2,16 @@
include _includes/_mixins include _includes/_mixins
- title = IS_MODELS ? LANGUAGES[current.source] || title : title
- social_title = (SECTION == "index") ? SITENAME + " - " + SLOGAN : title + " - " + SITENAME
- social_img = SITE_URL + "/assets/img/social/preview_" + (preview || ALPHA ? "alpha" : "default") + ".jpg"
doctype html doctype html
html(lang="en") html(lang="en")
title title
if SECTION == "docs" && SUBSECTION && SUBSECTION != "index" if SECTION == "api" || SECTION == "usage" || SECTION == "models"
| #{title} | #{SITENAME} #{SUBSECTION == "api" ? "API" : "Usage"} Documentation - var title_section = (SECTION == "api") ? "API" : SECTION.charAt(0).toUpperCase() + SECTION.slice(1)
| #{title} | #{SITENAME} #{title_section} Documentation
else if SECTION != "index" else if SECTION != "index"
| #{title} | #{SITENAME} | #{title} | #{SITENAME}
@ -22,32 +27,30 @@ html(lang="en")
meta(property="og:type" content="website") meta(property="og:type" content="website")
meta(property="og:site_name" content=sitename) meta(property="og:site_name" content=sitename)
meta(property="og:url" content="#{SITE_URL}/#{current.path.join('/')}") meta(property="og:url" content="#{SITE_URL}/#{current.path.join('/')}")
meta(property="og:title" content="#{title} - spaCy") meta(property="og:title" content=social_title)
meta(property="og:description" content=description) meta(property="og:description" content=description)
meta(property="og:image" content=getSocialImg()) meta(property="og:image" content=social_img)
meta(name="twitter:card" content="summary_large_image") meta(name="twitter:card" content="summary_large_image")
meta(name="twitter:site" content="@" + SOCIAL.twitter) meta(name="twitter:site" content="@" + SOCIAL.twitter)
meta(name="twitter:title" content="#{title} - spaCy") meta(name="twitter:title" content=social_title)
meta(name="twitter:description" content=description) meta(name="twitter:description" content=description)
meta(name="twitter:image" content=getSocialImg()) meta(name="twitter:image" content=social_img)
link(rel="shortcut icon" href="/assets/img/favicon.ico") link(rel="shortcut icon" href="/assets/img/favicon.ico")
link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico") link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico")
if ALPHA && SECTION == "docs" if SECTION == "api"
link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet") link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet")
else if SUBSECTION == "usage"
link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet")
else else
link(href="/assets/css/style.css?v#{V_CSS}" rel="stylesheet") link(href="/assets/css/style.css?v#{V_CSS}" rel="stylesheet")
body body
include _includes/_svg
include _includes/_navigation include _includes/_navigation
if SECTION == "docs" if !landing
include _includes/_page-docs include _includes/_page-docs
else else

View File

@ -0,0 +1,43 @@
//- 💫 DOCS > API > ANNOTATION > BILUO
+table([ "Tag", "Description" ])
+row
+cell #[code #[span.u-color-theme B] EGIN]
+cell The first token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme I] N]
+cell An inner token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme L] AST]
+cell The final token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme U] NIT]
+cell A single-token entity.
+row
+cell #[code #[span.u-color-theme O] UT]
+cell A non-entity token.
+aside("Why BILUO, not IOB?")
| There are several coding schemes for encoding entity annotations as
| token tags. These coding schemes are equally expressive, but not
| necessarily equally learnable.
| #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth]
| showed that the minimal #[strong Begin], #[strong In], #[strong Out]
| scheme was more difficult to learn than the #[strong BILUO] scheme that
| we use, which explicitly marks boundary tokens.
p
| spaCy translates the character offsets into this scheme, in order to
| decide the cost of each action given the current state of the entity
| recogniser. The costs are then used to calculate the gradient of the
| loss, to train the model. The exact algorithm is a pastiche of
| well-known methods, and is not currently described in any single
| publication. The model is a greedy transition-based parser guided by a
| linear model whose weights are learned using the averaged perceptron
| loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle]
| imitation learning strategy. The transition system is equivalent to the
| BILUO tagging scheme.

View File

@ -0,0 +1,115 @@
//- 💫 DOCS > API > ARCHITECTURE > CYTHON
+aside("What's Cython?")
| #[+a("http://cython.org/") Cython] is a language for writing
| C extensions for Python. Most Python code is also valid Cython, but
| you can add type declarations to get efficient memory-managed code
| just like C or C++.
p
| spaCy's core data structures are implemented as
| #[+a("http://cython.org/") Cython] #[code cdef] classes. Memory is
| managed through the #[+a(gh("cymem")) #[code cymem]]
| #[code cymem.Pool] class, which allows you
| to allocate memory which will be freed when the #[code Pool] object
| is garbage collected. This means you usually don't have to worry
| about freeing memory. You just have to decide which Python object
| owns the memory, and make it own the #[code Pool]. When that object
| goes out of scope, the memory will be freed. You do have to take
| care that no pointers outlive the object that owns them — but this
| is generally quite easy.
p
| All Cython modules should have the #[code # cython: infer_types=True]
| compiler directive at the top of the file. This makes the code much
| cleaner, as it avoids the need for many type declarations. If
| possible, you should prefer to declare your functions #[code nogil],
| even if you don't especially care about multi-threading. The reason
| is that #[code nogil] functions help the Cython compiler reason about
| your code quite a lot — you're telling the compiler that no Python
| dynamics are possible. This lets the compiler raise many more errors, and ensures
| your function will run at C speed.
p
| Cython gives you many choices of sequences: you could have a Python
| list, a numpy array, a memory view, a C++ vector, or a pointer.
| Pointers are preferred, because they are fastest, have the most
| explicit semantics, and let the compiler check your code more
| strictly. C++ vectors are also great — but you should only use them
| internally in functions. It's less friendly to accept a vector as an
| argument, because that asks the user to do much more work. Here's
| how to get a pointer from a numpy array, memory view or vector:
+code.
cdef void get_pointers(np.ndarray[int, mode='c'] numpy_array, vector[int] cpp_vector, int[::1] memory_view) nogil:
pointer1 = &lt;int*&gt;numpy_array.data
pointer2 = cpp_vector.data()
pointer3 = &memory_view[0]
p
| Both C arrays and C++ vectors reassure the compiler that no Python
| operations are possible on your variable. This is a big advantage:
| it lets the Cython compiler raise many more errors for you.
p
| When getting a pointer from a numpy array or memoryview, take care
| that the data is actually stored in C-contiguous order — otherwise
| you'll get a pointer to nonsense. The type-declarations in the code
| above should generate runtime errors if buffers with incorrect
| memory layouts are passed in. To iterate over the array, the
| following style is preferred:
+code.
cdef int c_total(const int* int_array, int length) nogil:
total = 0
for item in int_array[:length]:
total += item
return total
p
| If this is confusing, consider that the compiler couldn't deal with
| #[code for item in int_array:] — there's no length attached to a raw
| pointer, so how could we figure out where to stop? The length is
| provided in the slice notation as a solution to this. Note that we
| don't have to declare the type of #[code item] in the code above —
| the compiler can easily infer it. This gives us tidy code that looks
| quite like Python, but is exactly as fast as C — because we've made
| sure the compilation to C is trivial.
p
| Your functions cannot be declared #[code nogil] if they need to
| create Python objects or call Python functions. This is perfectly
| okay — you shouldn't torture your code just to get #[code nogil]
| functions. However, if your function isn't #[code nogil], you should
| compile your module with #[code cython -a --cplus my_module.pyx] and
| open the resulting #[code my_module.html] file in a browser. This
| will let you see how Cython is compiling your code. Calls into the
| Python run-time will be in bright yellow. This lets you easily see
| whether Cython is able to correctly type your code, or whether there
| are unexpected problems.
p
| Working in Cython is very rewarding once you're over the initial
| learning curve. As with C and C++, the first way you write something
| in Cython will often be the performance-optimal approach. In
| contrast, Python optimisation generally requires a lot of
| experimentation. Is it faster to have an #[code if item in my_dict]
| check, or to use #[code .get()]? What about
| #[code try]/#[code except]? Does this numpy operation create a copy?
| There's no way to guess the answers to these questions, and you'll
| usually be dissatisfied with your results — so there's no way to
| know when to stop this process. In the worst case, you'll make a
| mess that invites the next reader to try their luck too. This is
| like one of those
| #[+a("http://www.wemjournal.org/article/S1080-6032%2809%2970088-2/abstract") volcanic gas-traps],
| where the rescuers keep passing out from low oxygen, causing
| another rescuer to follow — only to succumb themselves. In short,
| just say no to optimizing your Python. If it's not fast enough the
| first time, just switch to Cython.
+infobox("Resources")
+list.o-no-block
+item #[+a("http://docs.cython.org/en/latest/") Official Cython documentation] (cython.org)
+item #[+a("https://explosion.ai/blog/writing-c-in-cython", true) Writing C in Cython] (explosion.ai)
+item #[+a("https://explosion.ai/blog/multithreading-with-cython") Multi-threading spaCy's parser and named entity recogniser] (explosion.ai)

View File

@ -0,0 +1,141 @@
//- 💫 DOCS > API > ARCHITECTURE > NN MODEL ARCHITECTURE
p
| The parsing model is a blend of recent results. The two recent
| inspirations have been the work of Eliyahu Kiperwasser and Yoav Goldberg at
| Bar Ilan#[+fn(1)], and the SyntaxNet team from Google. The foundation of
| the parser is still based on the work of Joakim Nivre#[+fn(2)], who
| introduced the transition-based framework#[+fn(3)], the arc-eager
| transition system, and the imitation learning objective. The model is
| implemented using #[+a(gh("thinc")) Thinc], spaCy's machine learning
| library. We first predict context-sensitive vectors for each word in the
| input:
+code.
(embed_lower | embed_prefix | embed_suffix | embed_shape)
&gt;&gt; Maxout(token_width)
&gt;&gt; convolution ** 4
p
| This convolutional layer is shared between the tagger, parser and NER,
| and will also be shared by the future neural lemmatizer. Because the
| parser shares these layers with the tagger, the parser does not require
| tag features. I got this trick from David Weiss's "Stack-propagation"
| paper#[+fn(4)].
p
| To boost the representation, the tagger actually predicts a "super tag"
| with POS, morphology and dependency label#[+fn(5)]. The tagger predicts
| these supertags by adding a softmax layer onto the convolutional layer —
| so, we're teaching the convolutional layer to give us a representation
| that's one affine transform from this informative lexical information.
| This is obviously good for the parser (which backprops to the
| convolutions too). The parser model makes a state vector by concatenating
| the vector representations for its context tokens. The current context
| tokens:
+table
+row
+cell #[code S0], #[code S1], #[code S2]
+cell Top three words on the stack.
+row
+cell #[code B0], #[code B1]
+cell First two words of the buffer.
+row
+cell.u-nowrap
| #[code S0L1], #[code S1L1], #[code S2L1], #[code B0L1],
| #[code B1L1]#[br]
| #[code S0L2], #[code S1L2], #[code S2L2], #[code B0L2],
| #[code B1L2]
+cell
| Leftmost and second leftmost children of #[code S0], #[code S1],
| #[code S2], #[code B0] and #[code B1].
+row
+cell.u-nowrap
| #[code S0R1], #[code S1R1], #[code S2R1], #[code B0R1],
| #[code B1R1]#[br]
| #[code S0R2], #[code S1R2], #[code S2R2], #[code B0R2],
| #[code B1R2]
+cell
| Rightmost and second rightmost children of #[code S0], #[code S1],
| #[code S2], #[code B0] and #[code B1].
p
| This makes the state vector quite long: #[code 13*T], where #[code T] is
| the token vector width (128 is working well). Fortunately, there's a way
| to structure the computation to save some expense (and make it more
| GPU-friendly).
p
| The parser typically visits #[code 2*N] states for a sentence of length
| #[code N] (although it may visit more, if it back-tracks with a
| non-monotonic transition#[+fn(6)]). A naive implementation would require
| #[code 2*N (B, 13*T) @ (13*T, H)] matrix multiplications for a batch of
| size #[code B]. We can instead perform one #[code (B*N, T) @ (T, 13*H)]
| multiplication, to pre-compute the hidden weights for each positional
| feature with respect to the words in the batch. (Note that our token
| vectors come from the CNN — so we can't play this trick over the
| vocabulary. That's how Stanford's NN parser#[+fn(7)] works — and why its
| model is so big.)
p
| This pre-computation strategy allows a nice compromise between
| GPU-friendliness and implementation simplicity. The CNN and the wide
| lower layer are computed on the GPU, and then the precomputed hidden
| weights are moved to the CPU, before we start the transition-based
| parsing process. This makes a lot of things much easier. We don't have to
| worry about variable-length batch sizes, and we don't have to implement
| the dynamic oracle in CUDA to train.
p
| Currently the parser's loss function is multilabel log loss#[+fn(8)], as
| the dynamic oracle allows multiple states to be 0 cost. This is defined
| as follows, where #[code gZ] is the sum of the scores assigned to gold
| classes:
+code.
(exp(score) / Z) - (exp(score) / gZ)
+bibliography
+item
| #[+a("https://www.semanticscholar.org/paper/Simple-and-Accurate-Dependency-Parsing-Using-Bidir-Kiperwasser-Goldberg/3cf31ecb2724b5088783d7c96a5fc0d5604cbf41") Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations]
br
| Eliyahu Kiperwasser, Yoav Goldberg. (2016)
+item
| #[+a("https://www.semanticscholar.org/paper/A-Dynamic-Oracle-for-Arc-Eager-Dependency-Parsing-Goldberg-Nivre/22697256ec19ecc3e14fcfc63624a44cf9c22df4") A Dynamic Oracle for Arc-Eager Dependency Parsing]
br
| Yoav Goldberg, Joakim Nivre (2012)
+item
| #[+a("https://explosion.ai/blog/parsing-english-in-python") Parsing English in 500 Lines of Python]
br
| Matthew Honnibal (2013)
+item
| #[+a("https://www.semanticscholar.org/paper/Stack-propagation-Improved-Representation-Learning-Zhang-Weiss/0c133f79b23e8c680891d2e49a66f0e3d37f1466") Stack-propagation: Improved Representation Learning for Syntax]
br
| Yuan Zhang, David Weiss (2016)
+item
| #[+a("https://www.semanticscholar.org/paper/Deep-multi-task-learning-with-low-level-tasks-supe-S%C3%B8gaard-Goldberg/03ad06583c9721855ccd82c3d969a01360218d86") Deep multi-task learning with low level tasks supervised at lower layers]
br
| Anders Søgaard, Yoav Goldberg (2016)
+item
| #[+a("https://www.semanticscholar.org/paper/An-Improved-Non-monotonic-Transition-System-for-De-Honnibal-Johnson/4094cee47ade13b77b5ab4d2e6cb9dd2b8a2917c") An Improved Non-monotonic Transition System for Dependency Parsing]
br
| Matthew Honnibal, Mark Johnson (2015)
+item
| #[+a("http://cs.stanford.edu/people/danqi/papers/emnlp2014.pdf") A Fast and Accurate Dependency Parser using Neural Networks]
br
| Danqi Chen, Christopher D. Manning (2014)
+item
| #[+a("https://www.semanticscholar.org/paper/Parsing-the-Wall-Street-Journal-using-a-Lexical-Fu-Riezler-King/0ad07862a91cd59b7eb5de38267e47725a62b8b2") Parsing the Wall Street Journal using a Lexical-Functional Grammar and Discriminative Estimation Techniques]
br
| Stefan Riezler et al. (2002)

View File

@ -1,29 +1,32 @@
{ {
"sidebar": { "sidebar": {
"Introduction": { "Overview": {
"Facts & Figures": "./", "Architecture": "./",
"Languages": "language-models", "Annotation Specs": "annotation",
"Annotation Specs": "annotation" "Functions": "top-level"
}, },
"Top-level": { "Containers": {
"spacy": "spacy",
"displacy": "displacy",
"Utility Functions": "util",
"Command line": "cli"
},
"Classes": {
"Doc": "doc", "Doc": "doc",
"Token": "token", "Token": "token",
"Span": "span", "Span": "span",
"Lexeme": "lexeme"
},
"Pipeline": {
"Language": "language", "Language": "language",
"Tokenizer": "tokenizer", "Pipe": "pipe",
"Tensorizer": "tensorizer", "Tensorizer": "tensorizer",
"Tagger": "tagger", "Tagger": "tagger",
"DependencyParser": "dependencyparser", "DependencyParser": "dependencyparser",
"EntityRecognizer": "entityrecognizer", "EntityRecognizer": "entityrecognizer",
"TextCategorizer": "textcategorizer", "TextCategorizer": "textcategorizer",
"Tokenizer": "tokenizer",
"Lemmatizer": "lemmatizer",
"Matcher": "matcher", "Matcher": "matcher",
"Lexeme": "lexeme", "PhraseMatcher": "phrasematcher"
},
"Other": {
"Vocab": "vocab", "Vocab": "vocab",
"StringStore": "stringstore", "StringStore": "stringstore",
"Vectors": "vectors", "Vectors": "vectors",
@ -34,52 +37,37 @@
}, },
"index": { "index": {
"title": "Facts & Figures", "title": "Architecture",
"next": "language-models" "next": "annotation",
"menu": {
"Basics": "basics",
"Neural Network Model": "nn-model",
"Cython Conventions": "cython"
}
}, },
"language-models": { "top-level": {
"title": "Languages", "title": "Top-level Functions",
"next": "philosophy" "menu": {
}, "spacy": "spacy",
"displacy": "displacy",
"philosophy": { "Utility Functions": "util",
"title": "Philosophy" "Compatibility": "compat",
}, "Command Line": "cli"
}
"spacy": {
"title": "spaCy top-level functions",
"source": "spacy/__init__.py",
"next": "displacy"
},
"displacy": {
"title": "displaCy",
"tag": "module",
"source": "spacy/displacy",
"next": "util"
},
"util": {
"title": "Utility Functions",
"source": "spacy/util.py",
"next": "cli"
},
"cli": {
"title": "Command Line Interface",
"source": "spacy/cli"
}, },
"language": { "language": {
"title": "Language", "title": "Language",
"tag": "class", "tag": "class",
"teaser": "A text-processing pipeline.",
"source": "spacy/language.py" "source": "spacy/language.py"
}, },
"doc": { "doc": {
"title": "Doc", "title": "Doc",
"tag": "class", "tag": "class",
"teaser": "A container for accessing linguistic annotations.",
"source": "spacy/tokens/doc.pyx" "source": "spacy/tokens/doc.pyx"
}, },
@ -103,6 +91,7 @@
"vocab": { "vocab": {
"title": "Vocab", "title": "Vocab",
"teaser": "A storage class for vocabulary and other data shared across a language.",
"tag": "class", "tag": "class",
"source": "spacy/vocab.pyx" "source": "spacy/vocab.pyx"
}, },
@ -115,10 +104,27 @@
"matcher": { "matcher": {
"title": "Matcher", "title": "Matcher",
"teaser": "Match sequences of tokens, based on pattern rules.",
"tag": "class", "tag": "class",
"source": "spacy/matcher.pyx" "source": "spacy/matcher.pyx"
}, },
"phrasematcher": {
"title": "PhraseMatcher",
"teaser": "Match sequences of tokens, based on documents.",
"tag": "class",
"tag_new": 2,
"source": "spacy/matcher.pyx"
},
"pipe": {
"title": "Pipe",
"teaser": "Abstract base class defining the API for pipeline components.",
"tag": "class",
"tag_new": 2,
"source": "spacy/pipeline.pyx"
},
"dependenyparser": { "dependenyparser": {
"title": "DependencyParser", "title": "DependencyParser",
"tag": "class", "tag": "class",
@ -127,18 +133,22 @@
"entityrecognizer": { "entityrecognizer": {
"title": "EntityRecognizer", "title": "EntityRecognizer",
"teaser": "Annotate named entities on documents.",
"tag": "class", "tag": "class",
"source": "spacy/pipeline.pyx" "source": "spacy/pipeline.pyx"
}, },
"textcategorizer": { "textcategorizer": {
"title": "TextCategorizer", "title": "TextCategorizer",
"teaser": "Add text categorization models to spaCy pipelines.",
"tag": "class", "tag": "class",
"tag_new": 2,
"source": "spacy/pipeline.pyx" "source": "spacy/pipeline.pyx"
}, },
"dependencyparser": { "dependencyparser": {
"title": "DependencyParser", "title": "DependencyParser",
"teaser": "Annotate syntactic dependencies on documents.",
"tag": "class", "tag": "class",
"source": "spacy/pipeline.pyx" "source": "spacy/pipeline.pyx"
}, },
@ -149,15 +159,23 @@
"source": "spacy/tokenizer.pyx" "source": "spacy/tokenizer.pyx"
}, },
"lemmatizer": {
"title": "Lemmatizer",
"tag": "class"
},
"tagger": { "tagger": {
"title": "Tagger", "title": "Tagger",
"teaser": "Annotate part-of-speech tags on documents.",
"tag": "class", "tag": "class",
"source": "spacy/pipeline.pyx" "source": "spacy/pipeline.pyx"
}, },
"tensorizer": { "tensorizer": {
"title": "Tensorizer", "title": "Tensorizer",
"teaser": "Add a tensor with position-sensitive meaning representations to a document.",
"tag": "class", "tag": "class",
"tag_new": 2,
"source": "spacy/pipeline.pyx" "source": "spacy/pipeline.pyx"
}, },
@ -169,23 +187,38 @@
"goldcorpus": { "goldcorpus": {
"title": "GoldCorpus", "title": "GoldCorpus",
"teaser": "An annotated corpus, using the JSON file format.",
"tag": "class", "tag": "class",
"tag_new": 2,
"source": "spacy/gold.pyx" "source": "spacy/gold.pyx"
}, },
"binder": { "binder": {
"title": "Binder", "title": "Binder",
"tag": "class", "tag": "class",
"tag_new": 2,
"source": "spacy/tokens/binder.pyx" "source": "spacy/tokens/binder.pyx"
}, },
"vectors": { "vectors": {
"title": "Vectors", "title": "Vectors",
"teaser": "Store, save and load word vectors.",
"tag": "class", "tag": "class",
"tag_new": 2,
"source": "spacy/vectors.pyx" "source": "spacy/vectors.pyx"
}, },
"annotation": { "annotation": {
"title": "Annotation Specifications" "title": "Annotation Specifications",
"teaser": "Schemes used for labels, tags and training data.",
"menu": {
"Tokenization": "tokenization",
"Sentence Boundaries": "sbd",
"POS Tagging": "pos-tagging",
"Lemmatization": "lemmatization",
"Dependencies": "dependency-parsing",
"Named Entities": "named-entities",
"Training Data": "training"
}
} }
} }

View File

@ -1,26 +1,17 @@
//- 💫 DOCS > USAGE > COMMAND LINE INTERFACE //- 💫 DOCS > API > TOP-LEVEL > COMMAND LINE INTERFACE
include ../../_includes/_mixins
p p
| As of v1.7.0, spaCy comes with new command line helpers to download and | As of v1.7.0, spaCy comes with new command line helpers to download and
| link models and show useful debugging information. For a list of available | link models and show useful debugging information. For a list of available
| commands, type #[code spacy --help]. | commands, type #[code spacy --help].
+infobox("⚠️ Deprecation note") +h(3, "download") Download
| As of spaCy 2.0, the #[code model] command to initialise a model data
| directory is deprecated. The command was only necessary because previous
| versions of spaCy expected a model directory to already be set up. This
| has since been changed, so you can use the #[+api("cli#train") #[code train]]
| command straight away.
+h(2, "download") Download
p p
| Download #[+a("/docs/usage/models") models] for spaCy. The downloader finds the | Download #[+a("/usage/models") models] for spaCy. The downloader finds the
| best-matching compatible version, uses pip to download the model as a | best-matching compatible version, uses pip to download the model as a
| package and automatically creates a | package and automatically creates a
| #[+a("/docs/usage/models#usage") shortcut link] to load the model by name. | #[+a("/usage/models#usage") shortcut link] to load the model by name.
| Direct downloads don't perform any compatibility checks and require the | Direct downloads don't perform any compatibility checks and require the
| model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]). | model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]).
@ -49,15 +40,15 @@ p
| detailed messages in case things go wrong. It's #[strong not recommended] | detailed messages in case things go wrong. It's #[strong not recommended]
| to use this command as part of an automated process. If you know which | to use this command as part of an automated process. If you know which
| model your project needs, you should consider a | model your project needs, you should consider a
| #[+a("/docs/usage/models#download-pip") direct download via pip], or | #[+a("/usage/models#download-pip") direct download via pip], or
| uploading the model to a local PyPi installation and fetching it straight | uploading the model to a local PyPi installation and fetching it straight
| from there. This will also allow you to add it as a versioned package | from there. This will also allow you to add it as a versioned package
| dependency to your project. | dependency to your project.
+h(2, "link") Link +h(3, "link") Link
p p
| Create a #[+a("/docs/usage/models#usage") shortcut link] for a model, | Create a #[+a("/usage/models#usage") shortcut link] for a model,
| either a Python package or a local directory. This will let you load | either a Python package or a local directory. This will let you load
| models from any location using a custom name via | models from any location using a custom name via
| #[+api("spacy#load") #[code spacy.load()]]. | #[+api("spacy#load") #[code spacy.load()]].
@ -95,7 +86,7 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+h(2, "info") Info +h(3, "info") Info
p p
| Print information about your spaCy installation, models and local setup, | Print information about your spaCy installation, models and local setup,
@ -122,15 +113,15 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+h(2, "convert") Convert +h(3, "convert") Convert
p p
| Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format] | Convert files into spaCy's #[+a("/api/annotation#json-input") JSON format]
| for use with the #[code train] command and other experiment management | for use with the #[code train] command and other experiment management
| functions. The right converter is chosen based on the file extension of | functions. The right converter is chosen based on the file extension of
| the input file. Currently only supports #[code .conllu]. | the input file. Currently only supports #[code .conllu].
+code(false, "bash", "$"). +code(false, "bash", "$", false, false, true).
spacy convert [input_file] [output_dir] [--n-sents] [--morphology] spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
+table(["Argument", "Type", "Description"]) +table(["Argument", "Type", "Description"])
@ -159,14 +150,18 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+h(2, "train") Train +h(3, "train") Train
p p
| Train a model. Expects data in spaCy's | Train a model. Expects data in spaCy's
| #[+a("/docs/api/annotation#json-input") JSON format]. | #[+a("/api/annotation#json-input") JSON format]. On each epoch, a model
| will be saved out to the directory. Accuracy scores and model details
| will be added to a #[+a("/usage/training#models-generating") #[code meta.json]]
| to allow packaging the model using the
| #[+api("cli#package") #[code package]] command.
+code(false, "bash", "$"). +code(false, "bash", "$", false, false, true).
spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities] spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser] [--no-entities] [--gold-preproc]
+table(["Argument", "Type", "Description"]) +table(["Argument", "Type", "Description"])
+row +row
@ -204,6 +199,27 @@ p
+cell option +cell option
+cell Use GPU. +cell Use GPU.
+row
+cell #[code --vectors], #[code -v]
+cell option
+cell Model to load vectors from.
+row
+cell #[code --meta-path], #[code -m]
+cell option
+cell
| #[+tag-new(2)] Optional path to model
| #[+a("/usage/training#models-generating") #[code meta.json]].
| All relevant properties like #[code lang], #[code pipeline] and
| #[code spacy_version] will be overwritten.
+row
+cell #[code --version], #[code -V]
+cell option
+cell
| Model version. Will be written out to the model's
| #[code meta.json] after training.
+row +row
+cell #[code --no-tagger], #[code -T] +cell #[code --no-tagger], #[code -T]
+cell flag +cell flag
@ -219,12 +235,18 @@ p
+cell flag +cell flag
+cell Don't train NER. +cell Don't train NER.
+row
+cell #[code --gold-preproc], #[code -G]
+cell flag
+cell Use gold preprocessing.
+row +row
+cell #[code --help], #[code -h] +cell #[code --help], #[code -h]
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+h(3, "train-hyperparams") Environment variables for hyperparameters +h(4, "train-hyperparams") Environment variables for hyperparameters
+tag-new(2)
p p
| spaCy lets you set hyperparameters for training via environment variables. | spaCy lets you set hyperparameters for training via environment variables.
@ -236,98 +258,149 @@ p
+code(false, "bash"). +code(false, "bash").
parser_hidden_depth=2 parser_maxout_pieces=1 train-parser parser_hidden_depth=2 parser_maxout_pieces=1 train-parser
+under-construction
+table(["Name", "Description", "Default"]) +table(["Name", "Description", "Default"])
+row +row
+cell #[code dropout_from] +cell #[code dropout_from]
+cell +cell Initial dropout rate.
+cell #[code 0.2] +cell #[code 0.2]
+row +row
+cell #[code dropout_to] +cell #[code dropout_to]
+cell +cell Final dropout rate.
+cell #[code 0.2] +cell #[code 0.2]
+row +row
+cell #[code dropout_decay] +cell #[code dropout_decay]
+cell +cell Rate of dropout change.
+cell #[code 0.0] +cell #[code 0.0]
+row +row
+cell #[code batch_from] +cell #[code batch_from]
+cell +cell Initial batch size.
+cell #[code 1] +cell #[code 1]
+row +row
+cell #[code batch_to] +cell #[code batch_to]
+cell +cell Final batch size.
+cell #[code 64] +cell #[code 64]
+row +row
+cell #[code batch_compound] +cell #[code batch_compound]
+cell +cell Rate of batch size acceleration.
+cell #[code 1.001] +cell #[code 1.001]
+row +row
+cell #[code token_vector_width] +cell #[code token_vector_width]
+cell +cell Width of embedding tables and convolutional layers.
+cell #[code 128] +cell #[code 128]
+row +row
+cell #[code embed_size] +cell #[code embed_size]
+cell +cell Number of rows in embedding tables.
+cell #[code 7500] +cell #[code 7500]
+row +row
+cell #[code parser_maxout_pieces] +cell #[code parser_maxout_pieces]
+cell +cell Number of pieces in the parser's and NER's first maxout layer.
+cell #[code 2] +cell #[code 2]
+row +row
+cell #[code parser_hidden_depth] +cell #[code parser_hidden_depth]
+cell +cell Number of hidden layers in the parser and NER.
+cell #[code 1] +cell #[code 1]
+row +row
+cell #[code hidden_width] +cell #[code hidden_width]
+cell +cell Size of the parser's and NER's hidden layers.
+cell #[code 128] +cell #[code 128]
+row +row
+cell #[code learn_rate] +cell #[code learn_rate]
+cell +cell Learning rate.
+cell #[code 0.001] +cell #[code 0.001]
+row +row
+cell #[code optimizer_B1] +cell #[code optimizer_B1]
+cell +cell Momentum for the Adam solver.
+cell #[code 0.9] +cell #[code 0.9]
+row +row
+cell #[code optimizer_B2] +cell #[code optimizer_B2]
+cell +cell Adagrad-momentum for the Adam solver.
+cell #[code 0.999] +cell #[code 0.999]
+row +row
+cell #[code optimizer_eps] +cell #[code optimizer_eps]
+cell +cell Epsilon value for the Adam solver.
+cell #[code 1e-08] +cell #[code 1e-08]
+row +row
+cell #[code L2_penalty] +cell #[code L2_penalty]
+cell +cell L2 regularisation penalty.
+cell #[code 1e-06] +cell #[code 1e-06]
+row +row
+cell #[code grad_norm_clip] +cell #[code grad_norm_clip]
+cell +cell Gradient L2 norm constraint.
+cell #[code 1.0] +cell #[code 1.0]
+h(2, "package") Package +h(3, "evaluate") Evaluate
+tag-new(2)
p p
| Generate a #[+a("/docs/usage/saving-loading#generating") model Python package] | Evaluate a model's accuracy and speed on JSON-formatted annotated data.
| Will print the results and optionally export
| #[+a("/usage/visualizers") displaCy visualizations] of a sample set of
| parses to #[code .html] files. Visualizations for the dependency parse
| and NER will be exported as separate files if the respective component
| is present in the model's pipeline.
+code(false, "bash", "$", false, false, true).
spacy evaluate [model] [data_path] [--displacy-path] [--displacy-limit] [--gpu-id] [--gold-preproc]
+table(["Argument", "Type", "Description"])
+row
+cell #[code model]
+cell positional
+cell
| Model to evaluate. Can be a package or shortcut link name, or a
| path to a model data directory.
+row
+cell #[code data_path]
+cell positional
+cell Location of JSON-formatted evaluation data.
+row
+cell #[code --displacy-path], #[code -dp]
+cell option
+cell
| Directory to output rendered parses as HTML. If not set, no
| visualizations will be generated.
+row
+cell #[code --displacy-limit], #[code -dl]
+cell option
+cell
| Number of parses to generate per file. Defaults to #[code 25].
| Keep in mind that a significantly higher number might cause the
| #[code .html] files to render slowly.
+row
+cell #[code --gpu-id], #[code -g]
+cell option
+cell GPU to use, if any. Defaults to #[code -1] for CPU.
+row
+cell #[code --gold-preproc], #[code -G]
+cell flag
+cell Use gold preprocessing.
+h(3, "package") Package
p
| Generate a #[+a("/usage/training#models-generating") model Python package]
| from an existing model data directory. All data files are copied over. | from an existing model data directory. All data files are copied over.
| If the path to a meta.json is supplied, or a meta.json is found in the | If the path to a meta.json is supplied, or a meta.json is found in the
| input directory, this file is used. Otherwise, the data can be entered | input directory, this file is used. Otherwise, the data can be entered
@ -336,8 +409,8 @@ p
| sure you're always using the latest versions. This means you need to be | sure you're always using the latest versions. This means you need to be
| connected to the internet to use this command. | connected to the internet to use this command.
+code(false, "bash", "$"). +code(false, "bash", "$", false, false, true).
spacy package [input_dir] [output_dir] [--meta] [--force] spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]
+table(["Argument", "Type", "Description"]) +table(["Argument", "Type", "Description"])
+row +row
@ -353,14 +426,14 @@ p
+row +row
+cell #[code --meta-path], #[code -m] +cell #[code --meta-path], #[code -m]
+cell option +cell option
+cell Path to meta.json file (optional). +cell #[+tag-new(2)] Path to meta.json file (optional).
+row +row
+cell #[code --create-meta], #[code -c] +cell #[code --create-meta], #[code -c]
+cell flag +cell flag
+cell +cell
| Create a meta.json file on the command line, even if one already | #[+tag-new(2)] Create a meta.json file on the command line, even
| exists in the directory. | if one already exists in the directory.
+row +row
+cell #[code --force], #[code -f] +cell #[code --force], #[code -f]

View File

@ -0,0 +1,91 @@
//- 💫 DOCS > API > TOP-LEVEL > COMPATIBILITY
p
| All Python code is written in an
| #[strong intersection of Python 2 and Python 3]. This is easy in Cython,
| but somewhat ugly in Python. Logic that deals with Python or platform
| compatibility only lives in #[code spacy.compat]. To distinguish them from
| the builtin functions, replacement functions are suffixed with an
| underscore, e.g. #[code unicode_]. For specific checks, spaCy uses the
| #[code six] and #[code ftfy] packages.
+aside-code("Example").
from spacy.compat import unicode_, json_dumps
compatible_unicode = unicode_('hello world')
compatible_json = json_dumps({'key': 'value'})
+table(["Name", "Python 2", "Python 3"])
+row
+cell #[code compat.bytes_]
+cell #[code str]
+cell #[code bytes]
+row
+cell #[code compat.unicode_]
+cell #[code unicode]
+cell #[code str]
+row
+cell #[code compat.basestring_]
+cell #[code basestring]
+cell #[code str]
+row
+cell #[code compat.input_]
+cell #[code raw_input]
+cell #[code input]
+row
+cell #[code compat.json_dumps]
+cell #[code ujson.dumps] with #[code .decode('utf8')]
+cell #[code ujson.dumps]
+row
+cell #[code compat.path2str]
+cell #[code str(path)] with #[code .decode('utf8')]
+cell #[code str(path)]
+h(3, "is_config") compat.is_config
+tag function
p
| Check if a specific configuration of Python version and operating system
| matches the user's setup. Mostly used to display targeted error messages.
+aside-code("Example").
from spacy.compat import is_config
if is_config(python2=True, windows=True):
print("You are using Python 2 on Windows.")
+table(["Name", "Type", "Description"])
+row
+cell #[code python2]
+cell bool
+cell spaCy is executed with Python 2.x.
+row
+cell #[code python3]
+cell bool
+cell spaCy is executed with Python 3.x.
+row
+cell #[code windows]
+cell bool
+cell spaCy is executed on Windows.
+row
+cell #[code linux]
+cell bool
+cell spaCy is executed on Linux.
+row
+cell #[code osx]
+cell bool
+cell spaCy is executed on OS X or macOS.
+row("foot")
+cell returns
+cell bool
+cell Whether the specified configuration matches the user's platform.

View File

@ -1,14 +1,12 @@
//- 💫 DOCS > API > DISPLACY //- 💫 DOCS > API > TOP-LEVEL > DISPLACY
include ../../_includes/_mixins
p p
| As of v2.0, spaCy comes with a built-in visualization suite. For more | As of v2.0, spaCy comes with a built-in visualization suite. For more
| info and examples, see the usage guide on | info and examples, see the usage guide on
| #[+a("/docs/usage/visualizers") visualizing spaCy]. | #[+a("/usage/visualizers") visualizing spaCy].
+h(2, "serve") displacy.serve +h(3, "displacy.serve") displacy.serve
+tag method +tag method
+tag-new(2) +tag-new(2)
@ -60,7 +58,7 @@ p
+cell bool +cell bool
+cell +cell
| Don't parse #[code Doc] and instead, expect a dict or list of | Don't parse #[code Doc] and instead, expect a dict or list of
| dicts. #[+a("/docs/usage/visualizers#manual-usage") See here] | dicts. #[+a("/usage/visualizers#manual-usage") See here]
| for formats and examples. | for formats and examples.
+cell #[code False] +cell #[code False]
@ -70,7 +68,7 @@ p
+cell Port to serve visualization. +cell Port to serve visualization.
+cell #[code 5000] +cell #[code 5000]
+h(2, "render") displacy.render +h(3, "displacy.render") displacy.render
+tag method +tag method
+tag-new(2) +tag-new(2)
@ -127,24 +125,24 @@ p Render a dependency parse tree or named entity visualization.
+cell bool +cell bool
+cell +cell
| Don't parse #[code Doc] and instead, expect a dict or list of | Don't parse #[code Doc] and instead, expect a dict or list of
| dicts. #[+a("/docs/usage/visualizers#manual-usage") See here] | dicts. #[+a("/usage/visualizers#manual-usage") See here]
| for formats and examples. | for formats and examples.
+cell #[code False] +cell #[code False]
+footrow +row("foot")
+cell returns +cell returns
+cell unicode +cell unicode
+cell Rendered HTML markup. +cell Rendered HTML markup.
+cell +cell
+h(2, "options") Visualizer options +h(3, "displacy_options") Visualizer options
p p
| The #[code options] argument lets you specify additional settings for | The #[code options] argument lets you specify additional settings for
| each visualizer. If a setting is not present in the options, the default | each visualizer. If a setting is not present in the options, the default
| value will be used. | value will be used.
+h(3, "options-dep") Dependency Visualizer options +h(4, "options-dep") Dependency Visualizer options
+aside-code("Example"). +aside-code("Example").
options = {'compact': True, 'color': 'blue'} options = {'compact': True, 'color': 'blue'}
@ -219,7 +217,7 @@ p
+cell Distance between words in px. +cell Distance between words in px.
+cell #[code 175] / #[code 85] (compact) +cell #[code 175] / #[code 85] (compact)
+h(3, "options-ent") Named Entity Visualizer options +h(4, "displacy_options-ent") Named Entity Visualizer options
+aside-code("Example"). +aside-code("Example").
options = {'ents': ['PERSON', 'ORG', 'PRODUCT'], options = {'ents': ['PERSON', 'ORG', 'PRODUCT'],
@ -244,6 +242,6 @@ p
p p
| By default, displaCy comes with colours for all | By default, displaCy comes with colours for all
| #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy]. | #[+a("/api/annotation#named-entities") entity types supported by spaCy].
| If you're using custom entity types, you can use the #[code colors] | If you're using custom entity types, you can use the #[code colors]
| setting to add your own colours for them. | setting to add your own colours for them.

View File

@ -1,15 +1,13 @@
//- 💫 DOCS > API > SPACY //- 💫 DOCS > API > TOP-LEVEL > SPACY
include ../../_includes/_mixins +h(3, "spacy.load") spacy.load
+h(2, "load") spacy.load
+tag function +tag function
+tag-model +tag-model
p p
| Load a model via its #[+a("/docs/usage/models#usage") shortcut link], | Load a model via its #[+a("/usage/models#usage") shortcut link],
| the name of an installed | the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode | #[+a("/usage/training#models-generating") model package], a unicode
| path or a #[code Path]-like object. spaCy will try resolving the load | path or a #[code Path]-like object. spaCy will try resolving the load
| argument in this order. If a model is loaded from a shortcut link or | argument in this order. If a model is loaded from a shortcut link or
| package name, spaCy will assume it's a Python package and import it and | package name, spaCy will assume it's a Python package and import it and
@ -38,25 +36,57 @@ p
+cell list +cell list
+cell +cell
| Names of pipeline components to | Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. | #[+a("/usage/processing-pipelines#disabling") disable].
+footrow +row("foot")
+cell returns +cell returns
+cell #[code Language] +cell #[code Language]
+cell A #[code Language] object with the loaded model. +cell A #[code Language] object with the loaded model.
+infobox("⚠️ Deprecation note") +infobox("Deprecation note", "⚠️")
.o-block .o-block
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy | As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
| will also raise an error if no model could be loaded and never just | will also raise an error if no model could be loaded and never just
| return an empty #[code Language] object. If you need a blank language, | return an empty #[code Language] object. If you need a blank language,
| you need to import it explicitly (#[code from spacy.lang.en import English]) | you can use the new function #[+api("spacy#blank") #[code spacy.blank()]]
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]]. | or import the class explicitly, e.g.
| #[code from spacy.lang.en import English].
+code-new nlp = spacy.load('/model') +code-new nlp = spacy.load('/model')
+code-old nlp = spacy.load('en', path='/model') +code-old nlp = spacy.load('en', path='/model')
+h(2, "info") spacy.info +h(3, "spacy.blank") spacy.blank
+tag function
+tag-new(2)
p
| Create a blank model of a given language class. This function is the
| twin of #[code spacy.load()].
+aside-code("Example").
nlp_en = spacy.blank('en')
nlp_de = spacy.blank('de')
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell ISO code of the language class to load.
+row
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/usage/processing-pipelines#disabling") disable].
+row("foot")
+cell returns
+cell #[code Language]
+cell An empty #[code Language] object of the appropriate subclass.
+h(4, "spacy.info") spacy.info
+tag function +tag function
p p
@ -83,13 +113,13 @@ p
+cell Print information as Markdown. +cell Print information as Markdown.
+h(2, "explain") spacy.explain +h(3, "spacy.explain") spacy.explain
+tag function +tag function
p p
| Get a description for a given POS tag, dependency label or entity type. | Get a description for a given POS tag, dependency label or entity type.
| For a list of available terms, see | For a list of available terms, see
| #[+src(gh("spacy", "spacy/glossary.py")) glossary.py]. | #[+src(gh("spacy", "spacy/glossary.py")) #[code glossary.py]].
+aside-code("Example"). +aside-code("Example").
spacy.explain('NORP') spacy.explain('NORP')
@ -107,18 +137,18 @@ p
+cell unicode +cell unicode
+cell Term to explain. +cell Term to explain.
+footrow +row("foot")
+cell returns +cell returns
+cell unicode +cell unicode
+cell The explanation, or #[code None] if not found in the glossary. +cell The explanation, or #[code None] if not found in the glossary.
+h(2, "set_factory") spacy.set_factory +h(3, "spacy.set_factory") spacy.set_factory
+tag function +tag function
+tag-new(2) +tag-new(2)
p p
| Set a factory that returns a custom | Set a factory that returns a custom
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline] | #[+a("/usage/processing-pipelines") processing pipeline]
| component. Factories are useful for creating stateful components, especially ones which depend on shared data. | component. Factories are useful for creating stateful components, especially ones which depend on shared data.
+aside-code("Example"). +aside-code("Example").

View File

@ -1,10 +1,8 @@
//- 💫 DOCS > API > UTIL //- 💫 DOCS > API > TOP-LEVEL > UTIL
include ../../_includes/_mixins
p p
| spaCy comes with a small collection of utility functions located in | spaCy comes with a small collection of utility functions located in
| #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py]. | #[+src(gh("spaCy", "spacy/util.py")) #[code spacy/util.py]].
| Because utility functions are mostly intended for | Because utility functions are mostly intended for
| #[strong internal use within spaCy], their behaviour may change with | #[strong internal use within spaCy], their behaviour may change with
| future releases. The functions documented on this page should be safe | future releases. The functions documented on this page should be safe
@ -12,7 +10,7 @@ p
| recommend having additional tests in place if your application depends on | recommend having additional tests in place if your application depends on
| any of spaCy's utilities. | any of spaCy's utilities.
+h(2, "get_data_path") util.get_data_path +h(3, "util.get_data_path") util.get_data_path
+tag function +tag function
p p
@ -25,12 +23,12 @@ p
+cell bool +cell bool
+cell Only return path if it exists, otherwise return #[code None]. +cell Only return path if it exists, otherwise return #[code None].
+footrow +row("foot")
+cell returns +cell returns
+cell #[code Path] / #[code None] +cell #[code Path] / #[code None]
+cell Data path or #[code None]. +cell Data path or #[code None].
+h(2, "set_data_path") util.set_data_path +h(3, "util.set_data_path") util.set_data_path
+tag function +tag function
p p
@ -47,12 +45,12 @@ p
+cell unicode or #[code Path] +cell unicode or #[code Path]
+cell Path to new data directory. +cell Path to new data directory.
+h(2, "get_lang_class") util.get_lang_class +h(3, "util.get_lang_class") util.get_lang_class
+tag function +tag function
p p
| Import and load a #[code Language] class. Allows lazy-loading | Import and load a #[code Language] class. Allows lazy-loading
| #[+a("/docs/usage/adding-languages") language data] and importing | #[+a("/usage/adding-languages") language data] and importing
| languages using the two-letter language code. | languages using the two-letter language code.
+aside-code("Example"). +aside-code("Example").
@ -67,12 +65,12 @@ p
+cell unicode +cell unicode
+cell Two-letter language code, e.g. #[code 'en']. +cell Two-letter language code, e.g. #[code 'en'].
+footrow +row("foot")
+cell returns +cell returns
+cell #[code Language] +cell #[code Language]
+cell Language class. +cell Language class.
+h(2, "load_model") util.load_model +h(3, "util.load_model") util.load_model
+tag function +tag function
+tag-new(2) +tag-new(2)
@ -101,12 +99,12 @@ p
+cell - +cell -
+cell Specific overrides, like pipeline components to disable. +cell Specific overrides, like pipeline components to disable.
+footrow +row("foot")
+cell returns +cell returns
+cell #[code Language] +cell #[code Language]
+cell #[code Language] class with the loaded model. +cell #[code Language] class with the loaded model.
+h(2, "load_model_from_path") util.load_model_from_path +h(3, "util.load_model_from_path") util.load_model_from_path
+tag function +tag function
+tag-new(2) +tag-new(2)
@ -139,18 +137,18 @@ p
+cell - +cell -
+cell Specific overrides, like pipeline components to disable. +cell Specific overrides, like pipeline components to disable.
+footrow +row("foot")
+cell returns +cell returns
+cell #[code Language] +cell #[code Language]
+cell #[code Language] class with the loaded model. +cell #[code Language] class with the loaded model.
+h(2, "load_model_from_init_py") util.load_model_from_init_py +h(3, "util.load_model_from_init_py") util.load_model_from_init_py
+tag function +tag function
+tag-new(2) +tag-new(2)
p p
| A helper function to use in the #[code load()] method of a model package's | A helper function to use in the #[code load()] method of a model package's
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]. | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]].
+aside-code("Example"). +aside-code("Example").
from spacy.util import load_model_from_init_py from spacy.util import load_model_from_init_py
@ -169,12 +167,12 @@ p
+cell - +cell -
+cell Specific overrides, like pipeline components to disable. +cell Specific overrides, like pipeline components to disable.
+footrow +row("foot")
+cell returns +cell returns
+cell #[code Language] +cell #[code Language]
+cell #[code Language] class with the loaded model. +cell #[code Language] class with the loaded model.
+h(2, "get_model_meta") util.get_model_meta +h(3, "util.get_model_meta") util.get_model_meta
+tag function +tag function
+tag-new(2) +tag-new(2)
@ -190,17 +188,17 @@ p
+cell unicode or #[code Path] +cell unicode or #[code Path]
+cell Path to model directory. +cell Path to model directory.
+footrow +row("foot")
+cell returns +cell returns
+cell dict +cell dict
+cell The model's meta data. +cell The model's meta data.
+h(2, "is_package") util.is_package +h(3, "util.is_package") util.is_package
+tag function +tag function
p p
| Check if string maps to a package installed via pip. Mainly used to | Check if string maps to a package installed via pip. Mainly used to
| validate #[+a("/docs/usage/models") model packages]. | validate #[+a("/usage/models") model packages].
+aside-code("Example"). +aside-code("Example").
util.is_package('en_core_web_sm') # True util.is_package('en_core_web_sm') # True
@ -212,18 +210,18 @@ p
+cell unicode +cell unicode
+cell Name of package. +cell Name of package.
+footrow +row("foot")
+cell returns +cell returns
+cell #[code bool] +cell #[code bool]
+cell #[code True] if installed package, #[code False] if not. +cell #[code True] if installed package, #[code False] if not.
+h(2, "get_package_path") util.get_package_path +h(3, "util.get_package_path") util.get_package_path
+tag function +tag function
+tag-new(2) +tag-new(2)
p p
| Get path to an installed package. Mainly used to resolve the location of | Get path to an installed package. Mainly used to resolve the location of
| #[+a("/docs/usage/models") model packages]. Currently imports the package | #[+a("/usage/models") model packages]. Currently imports the package
| to find its path. | to find its path.
+aside-code("Example"). +aside-code("Example").
@ -236,12 +234,12 @@ p
+cell unicode +cell unicode
+cell Name of installed package. +cell Name of installed package.
+footrow +row("foot")
+cell returns +cell returns
+cell #[code Path] +cell #[code Path]
+cell Path to model package directory. +cell Path to model package directory.
+h(2, "is_in_jupyter") util.is_in_jupyter +h(3, "util.is_in_jupyter") util.is_in_jupyter
+tag function +tag function
+tag-new(2) +tag-new(2)
@ -257,17 +255,17 @@ p
return display(HTML(html)) return display(HTML(html))
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +row("foot")
+cell returns +cell returns
+cell bool +cell bool
+cell #[code True] if in Jupyter, #[code False] if not. +cell #[code True] if in Jupyter, #[code False] if not.
+h(2, "update_exc") util.update_exc +h(3, "util.update_exc") util.update_exc
+tag function +tag function
p p
| Update, validate and overwrite | Update, validate and overwrite
| #[+a("/docs/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions]. | #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions].
| Used to combine global exceptions with custom, language-specific | Used to combine global exceptions with custom, language-specific
| exceptions. Will raise an error if key doesn't match #[code ORTH] values. | exceptions. Will raise an error if key doesn't match #[code ORTH] values.
@ -288,20 +286,20 @@ p
+cell dicts +cell dicts
+cell Exception dictionaries to add to the base exceptions, in order. +cell Exception dictionaries to add to the base exceptions, in order.
+footrow +row("foot")
+cell returns +cell returns
+cell dict +cell dict
+cell Combined tokenizer exceptions. +cell Combined tokenizer exceptions.
+h(2, "prints") util.prints +h(3, "util.prints") util.prints
+tag function +tag function
+tag-new(2) +tag-new(2)
p p
| Print a formatted, text-wrapped message with optional title. If a text | Print a formatted, text-wrapped message with optional title. If a text
| argument is a #[code Path], it's converted to a string. Should only | argument is a #[code Path], it's converted to a string. Should only
| be used for interactive components like the #[+api("cli") cli]. | be used for interactive components like the command-line interface.
+aside-code("Example"). +aside-code("Example").
data_path = Path('/some/path') data_path = Path('/some/path')

131
website/api/annotation.jade Normal file
View File

@ -0,0 +1,131 @@
//- 💫 DOCS > API > ANNOTATION SPECS
include ../_includes/_mixins
p This document describes the target annotations spaCy is trained to predict.
+section("tokenization")
+h(2, "tokenization") Tokenization
p
| Tokenization standards are based on the
| #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] corpus.
| The tokenizer differs from most by including tokens for significant
| whitespace. Any sequence of whitespace characters beyond a single space
| (#[code ' ']) is included as a token.
+aside-code("Example").
from spacy.lang.en import English
nlp = English()
tokens = nlp('Some\nspaces  and\ttab characters')
tokens_text = [t.text for t in tokens]
assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and',
'\t', 'tab', 'characters']
p
| The whitespace tokens are useful for much the same reason punctuation is
| – it's often an important delimiter in the text. By preserving it in the
| token output, we are able to maintain a simple alignment between the
| tokens and the original string, and we ensure that no information is
| lost during processing.
+section("sbd")
+h(2, "sentence-boundary") Sentence boundary detection
p
| Sentence boundaries are calculated from the syntactic parse tree, so
| features such as punctuation and capitalisation play an important but
| non-decisive role in determining the sentence boundaries. Usually this
| means that the sentence boundaries will at least coincide with clause
| boundaries, even given poorly punctuated text.
+section("pos-tagging")
+h(2, "pos-tagging") Part-of-speech Tagging
+aside("Tip: Understanding tags")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of a tag. For example,
| #[code spacy.explain("RB")] will return "adverb".
include _annotation/_pos-tags
+section("lemmatization")
+h(2, "lemmatization") Lemmatization
p A "lemma" is the uninflected form of a word. In English, this means:
+list
+item #[strong Adjectives]: The form like "happy", not "happier" or "happiest"
+item #[strong Adverbs]: The form like "badly", not "worse" or "worst"
+item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children"
+item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written"
p
| The lemmatization data is taken from
| #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a
| special case for pronouns: all pronouns are lemmatized to the special
| token #[code -PRON-].
+infobox("About spaCy's custom pronoun lemma")
| Unlike verbs and common nouns, there's no clear base form of a personal
| pronoun. Should the lemma of "me" be "I", or should we normalize person
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
| novel symbol, #[code -PRON-], which is used as the lemma for
| all personal pronouns.
+section("dependency-parsing")
+h(2, "dependency-parsing") Syntactic Dependency Parsing
+aside("Tip: Understanding labels")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of a label. For example,
| #[code spacy.explain("prt")] will return "particle".
include _annotation/_dep-labels
+section("named-entities")
+h(2, "named-entities") Named Entity Recognition
+aside("Tip: Understanding entity types")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of an entity label. For example,
| #[code spacy.explain("LANGUAGE")] will return "any named language".
include _annotation/_named-entities
+h(3, "biluo") BILUO Scheme
include _annotation/_biluo
+section("training")
+h(2, "json-input") JSON input format for training
+under-construction
p spaCy takes training data in the following format:
+code("Example structure").
doc: {
id: string,
paragraphs: [{
raw: string,
sents: [int],
tokens: [{
start: int,
tag: string,
head: int,
dep: string
}],
ner: [{
start: int,
end: int,
label: string
}],
brackets: [{
start: int,
end: int,
label: string
}]
}]
}

View File

@ -1,6 +1,6 @@
//- 💫 DOCS > API > BINDER //- 💫 DOCS > API > BINDER
include ../../_includes/_mixins include ../_includes/_mixins
p A container class for serializing collections of #[code Doc] objects. p A container class for serializing collections of #[code Doc] objects.

View File

@ -0,0 +1,5 @@
//- 💫 DOCS > API > DEPENDENCYPARSER
include ../_includes/_mixins
!=partial("pipe", { subclass: "DependencyParser", short: "parser", pipeline_id: "parser" })

Some files were not shown because too many files have changed in this diff Show More