mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Wrap try/except around model saving
This commit is contained in:
commit
c6cd81f192
|
@ -1 +1,55 @@
|
|||
environment:
|
||||
|
||||
matrix:
|
||||
|
||||
# For Python versions available on Appveyor, see
|
||||
# http://www.appveyor.com/docs/installed-software#python
|
||||
# The list here is complete (excluding Python 2.6, which
|
||||
# isn't covered by this document) at the time of writing.
|
||||
|
||||
- PYTHON: "C:\\Python27"
|
||||
#- PYTHON: "C:\\Python33"
|
||||
#- PYTHON: "C:\\Python34"
|
||||
#- PYTHON: "C:\\Python35"
|
||||
#- PYTHON: "C:\\Python27-x64"
|
||||
#- PYTHON: "C:\\Python33-x64"
|
||||
#- DISTUTILS_USE_SDK: "1"
|
||||
#- PYTHON: "C:\\Python34-x64"
|
||||
#- DISTUTILS_USE_SDK: "1"
|
||||
#- PYTHON: "C:\\Python35-x64"
|
||||
- PYTHON: "C:\\Python36-x64"
|
||||
|
||||
install:
|
||||
# We need wheel installed to build wheels
|
||||
- "%PYTHON%\\python.exe -m pip install wheel"
|
||||
- "%PYTHON%\\python.exe -m pip install cython"
|
||||
- "%PYTHON%\\python.exe -m pip install -r requirements.txt"
|
||||
- "%PYTHON%\\python.exe -m pip install -e ."
|
||||
|
||||
build: off
|
||||
|
||||
test_script:
|
||||
# Put your test command here.
|
||||
# If you don't need to build C extensions on 64-bit Python 3.3 or 3.4,
|
||||
# you can remove "build.cmd" from the front of the command, as it's
|
||||
# only needed to support those cases.
|
||||
# Note that you must use the environment variable %PYTHON% to refer to
|
||||
# the interpreter you're using - Appveyor does not do anything special
|
||||
# to put the Python version you want to use on PATH.
|
||||
- "%PYTHON%\\python.exe -m pytest spacy/"
|
||||
|
||||
after_test:
|
||||
# This step builds your wheels.
|
||||
# Again, you only need build.cmd if you're building C extensions for
|
||||
# 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct
|
||||
# interpreter
|
||||
- "%PYTHON%\\python.exe setup.py bdist_wheel"
|
||||
|
||||
artifacts:
|
||||
# bdist_wheel puts your built wheel in the dist directory
|
||||
- path: dist\*
|
||||
|
||||
#on_success:
|
||||
# You can use this step to upload your artifacts to a public website.
|
||||
# See Appveyor's documentation for more details. Or you can simply
|
||||
# access your wheels from the Appveyor "artifacts" tab for your build.
|
||||
|
|
11
.buildkite/sdist.yml
Normal file
11
.buildkite/sdist.yml
Normal file
|
@ -0,0 +1,11 @@
|
|||
steps:
|
||||
-
|
||||
command: "fab env clean make test sdist"
|
||||
label: ":dizzy: :python:"
|
||||
artifact_paths: "dist/*.tar.gz"
|
||||
- wait
|
||||
- trigger: "spacy-sdist-against-models"
|
||||
label: ":dizzy: :hammer:"
|
||||
build:
|
||||
env:
|
||||
SPACY_VERSION: "{$SPACY_VERSION}"
|
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -1,14 +1,12 @@
|
|||
# spaCy
|
||||
spacy/data/
|
||||
corpora/
|
||||
models/
|
||||
/models/
|
||||
keys/
|
||||
|
||||
# Website
|
||||
website/www/
|
||||
website/_deploy.sh
|
||||
website/package.json
|
||||
website/announcement.jade
|
||||
website/.gitignore
|
||||
|
||||
# Cython / C extensions
|
||||
|
|
|
@ -1,322 +0,0 @@
|
|||
'''WIP --- Doesn't work well yet'''
|
||||
import plac
|
||||
import random
|
||||
import six
|
||||
|
||||
import cProfile
|
||||
import pstats
|
||||
|
||||
import pathlib
|
||||
import cPickle as pickle
|
||||
from itertools import izip
|
||||
|
||||
import spacy
|
||||
|
||||
import cytoolz
|
||||
import cupy as xp
|
||||
import cupy.cuda
|
||||
import chainer.cuda
|
||||
|
||||
import chainer.links as L
|
||||
import chainer.functions as F
|
||||
from chainer import Chain, Variable, report
|
||||
import chainer.training
|
||||
import chainer.optimizers
|
||||
from chainer.training import extensions
|
||||
from chainer.iterators import SerialIterator
|
||||
from chainer.datasets import TupleDataset
|
||||
|
||||
|
||||
class SentimentAnalyser(object):
    '''Pipeline component that writes model predictions to `doc.sentiment`.

    Wraps any object exposing `predict(X)`. Features are produced by the
    module-level `get_features` helper, truncated to `max_length` tokens.
    '''

    def __init__(self, model, max_length=100):
        self.max_length = max_length
        self._model = model

    @classmethod
    def load(cls, path, nlp, max_length=100):
        '''Load a saved analyser. Not implemented yet: always raises.'''
        raise NotImplementedError
        # Sketch of the intended implementation, kept for reference:
        #with (path / 'config.json').open() as file_:
        #    model = model_from_json(file_.read())
        #with (path / 'model').open('rb') as file_:
        #    lstm_weights = pickle.load(file_)
        #embeddings = get_embeddings(nlp.vocab)
        #model.set_weights([embeddings] + lstm_weights)
        #return cls(model, max_length=max_length)

    def __call__(self, doc):
        '''Predict for a single doc and store the score on it (in place).'''
        features = get_features([doc], self.max_length)
        scores = self._model.predict(features)
        self.set_sentiment(doc, scores)

    def pipe(self, docs, batch_size=1000, n_threads=2):
        '''Stream docs through the model, `batch_size` docs at a time.

        Predictions are made per sentence; each sentence's score, centred
        by subtracting 0.5, is accumulated onto its parent doc's
        `sentiment`. `n_threads` is accepted but not used here.
        '''
        for chunk in cytoolz.partition_all(batch_size, docs):
            batch_docs = list(chunk)
            sents = [sent for doc in batch_docs for sent in doc.sents]
            preds = self._model.predict(get_features(sents, self.max_length))
            for sent, pred in zip(sents, preds):
                sent.doc.sentiment += pred - 0.5
            for doc in batch_docs:
                yield doc

    def set_sentiment(self, doc, y):
        '''Store the first element of `y` as the doc's sentiment score.'''
        # Sentiment has a native slot for a single float.
        # For arbitrary data storage, there's:
        # doc.user_data['my_data'] = y
        doc.sentiment = float(y[0])
|
||||
|
||||
|
||||
class Classifier(Chain):
    '''Training wrapper that turns a predictor chain into a loss function.'''

    def __init__(self, predictor):
        super(Classifier, self).__init__(predictor=predictor)

    def __call__(self, x, t):
        '''Return the softmax cross-entropy of `predictor(x)` against `t`.

        Loss and accuracy are also passed to chainer's `report`.
        '''
        scores = self.predictor(x)
        loss = F.softmax_cross_entropy(scores, t)
        metrics = {'loss': loss, 'accuracy': F.accuracy(scores, t)}
        report(metrics, self)
        return loss
|
||||
|
||||
|
||||
class SentimentModel(Chain):
    '''Embed -> encode -> attend -> predict chain for sentence sentiment.

    `shape` must provide 'nr_vector', 'nr_dim', 'nr_hidden' and 'nr_class'.
    Extra keyword `settings` are accepted but not used by this class.
    '''

    def __init__(self, nlp, shape, **settings):
        embed = _Embed(
            shape['nr_vector'], shape['nr_dim'], shape['nr_hidden'],
            # The embedding table is filled from the spaCy vocab's vectors.
            set_vectors=lambda arr: set_vectors(arr, nlp.vocab))
        encode = _Encode(shape['nr_hidden'], shape['nr_hidden'])
        attend = _Attend(shape['nr_hidden'], shape['nr_hidden'])
        predict = _Predict(shape['nr_hidden'], shape['nr_class'])
        Chain.__init__(self, embed=embed, encode=encode, attend=attend,
                       predict=predict)
        self.to_gpu(0)

    def __call__(self, sentence):
        '''Run `sentence` through the four stages and return the result.'''
        embedded = self.embed(sentence)
        encoded = self.encode(embedded)
        attended = self.attend(encoded)
        return self.predict(attended)
|
||||
|
||||
|
||||
class _Embed(Chain):
    '''Look up an embedding per token ID, then project it to `nr_out`.'''

    def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None):
        Chain.__init__(
            self,
            embed=L.EmbedID(nr_vector, nr_dim, initialW=set_vectors),
            project=L.Linear(None, nr_out, nobias=True))
        self.embed.W.volatile = False

    def __call__(self, sentence):
        '''Return one projected embedding per row of `sentence` transposed.'''
        projected = []
        for ts in F.transpose(sentence):
            projected.append(self.project(self.embed(ts)))
        return projected
|
||||
|
||||
|
||||
class _Encode(Chain):
    '''Bidirectional LSTM encoder whose two directions are mixed bilinearly.'''

    def __init__(self, nr_in, nr_out):
        Chain.__init__(self,
                       fwd=L.LSTM(nr_in, nr_out),
                       bwd=L.LSTM(nr_in, nr_out),
                       mix=L.Bilinear(nr_out, nr_out, nr_out))

    def __call__(self, sentence):
        '''Encode a sequence of per-step inputs.

        Returns a list with one mixed (forward, backward) state per step,
        in the original step order.
        '''
        # Build real lists rather than relying on map() returning one:
        # reversed() needs a sequence, and on Python 3 map() is a lazy
        # iterator, so `reversed(map(...))` raises TypeError there.
        self.fwd.reset_state()
        fwds = [self.fwd(step) for step in sentence]
        self.bwd.reset_state()
        bwds = [self.bwd(step) for step in reversed(sentence)]
        # Put backward states back in forward order so they align with fwds.
        bwds.reverse()
        return [F.elu(self.mix(f, b)) for f, b in zip(fwds, bwds)]
|
||||
|
||||
|
||||
class _Attend(Chain):
    '''Placeholder attention layer: it simply sums the step vectors.

    `nr_in` and `nr_out` are accepted for interface symmetry with the other
    layers; no parameters are created.
    '''

    def __init__(self, nr_in, nr_out):
        Chain.__init__(self)

    def __call__(self, sentence):
        '''Return the sum of all items in `sentence`.'''
        return sum(sentence)
|
||||
|
||||
|
||||
class _Predict(Chain):
    '''Output layers: linear -> ELU -> linear.'''

    def __init__(self, nr_in, nr_out):
        Chain.__init__(self,
                       l1=L.Linear(nr_in, nr_in),
                       l2=L.Linear(nr_in, nr_out))

    def __call__(self, vector):
        '''Map `vector` to `nr_out` output scores.'''
        hidden = F.elu(self.l1(vector))
        return self.l2(hidden)
|
||||
|
||||
|
||||
class SentenceDataset(TupleDataset):
    '''Dataset of (features, label) pairs with one example per sentence.

    Each sentence inherits the label of the document it came from.
    '''

    def __init__(self, nlp, texts, labels, max_length):
        self.max_length = max_length
        docs = nlp.pipe(texts, batch_size=5000, n_threads=3)
        sents, labels = self._get_labelled_sentences(docs, labels)
        TupleDataset.__init__(self, get_features(sents, max_length), labels)

    def __getitem__(self, index):
        '''Return one example tuple, or a list of tuples for a slice.'''
        batches = [dataset[index] for dataset in self._datasets]
        if not isinstance(index, slice):
            return tuple(batches)
        length = len(batches[0])
        return [tuple([batch[i] for batch in batches])
                for i in six.moves.range(length)]

    def _get_labelled_sentences(self, docs, doc_labels):
        '''Flatten docs into sentences, repeating each doc label per sentence.

        Returns `(sentences, labels)` where labels is an `xp` int array.
        '''
        sentences = []
        labels = []
        for doc, y in izip(docs, doc_labels):
            doc_sents = list(doc.sents)
            sentences.extend(doc_sents)
            labels.extend([y] * len(doc_sents))
        return sentences, xp.asarray(labels, dtype='i')
|
||||
|
||||
|
||||
class DocDataset(TupleDataset):
    '''Dataset of (features, label) pairs with one example per document.'''

    def __init__(self, nlp, texts, labels, max_length=100):
        '''Parse `texts` with `nlp` and build the feature matrix.

        `max_length` is the number of tokens kept per document. It used to
        be read from an undefined name, so it is now an optional parameter
        whose default matches the other `max_length` defaults in this file.
        '''
        self.max_length = max_length
        docs = nlp.pipe(texts, batch_size=5000, n_threads=3)
        # Initialise through the actual base class; the previous call went
        # to `DatasetMixin`, which this module never imports.
        TupleDataset.__init__(self,
                              get_features(docs, self.max_length),
                              labels)
|
||||
|
||||
def read_data(data_dir, limit=0):
    '''Read labelled texts from `data_dir/pos` (label 1) and `data_dir/neg` (0).

    data_dir: a pathlib.Path containing the two sub-directories.
    limit: if >= 1, keep only that many examples after shuffling.

    Returns a `(texts, labels)` pair of equal-length tuples, in shuffled
    order. Both are empty when no files are found, so callers can always
    unpack the result into two names.
    '''
    examples = []
    for subdir, label in (('pos', 1), ('neg', 0)):
        for filename in (data_dir / subdir).iterdir():
            with filename.open() as file_:
                examples.append((file_.read(), label))
    random.shuffle(examples)
    if limit >= 1:
        examples = examples[:limit]
    if not examples:
        # zip(*[]) yields nothing, which would break two-way unpacking.
        return (), ()
    return tuple(zip(*examples))  # Unzips into two tuples
|
||||
|
||||
|
||||
def get_features(docs, max_length):
    '''Build an integer feature matrix from `docs`.

    Each row holds the `norm` values of up to `max_length` tokens of one
    doc, keeping only tokens that have a vector and are neither punctuation
    nor whitespace. Unused trailing cells stay 0.

    Returns an `xp` int array of shape (number of docs, max_length).
    '''
    docs = list(docs)
    Xs = xp.zeros((len(docs), max_length), dtype='i')
    for i, doc in enumerate(docs):
        j = 0
        for token in doc:
            # Test the bound before writing so that max_length == 0 gives
            # an empty row instead of an out-of-range index.
            if j >= max_length:
                break
            if token.has_vector and not token.is_punct and not token.is_space:
                Xs[i, j] = token.norm
                j += 1
    return Xs
|
||||
|
||||
|
||||
def set_vectors(vectors, vocab):
    '''Copy word vectors from `vocab` into the rows of `vectors`.

    A lexeme with a vector whose `rank + 1` fits inside the table gets row
    `rank + 1`, and its `norm` is set to that row index. Every other lexeme
    has its `norm` set to 0, so row 0 is shared by all unknown words.

    The table is modified in place and also returned.
    '''
    for lex in vocab:
        if not lex.has_vector or lex.rank + 1 >= vectors.shape[0]:
            lex.norm = 0
            continue
        row = lex.rank + 1
        lex.norm = row
        vectors[row] = lex.vector
    return vectors
|
||||
|
||||
|
||||
def train(train_texts, train_labels, dev_texts, dev_labels,
          lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
          by_sentence=True):
    '''Train the sentiment classifier with chainer and return it.

    lstm_shape: dict of model dimensions. 'nr_vector' and 'nr_dim' are
        filled in from the loaded spaCy vocab when absent (the dict is
        updated in place).
    lstm_settings: forwarded to SentimentModel as keyword arguments.
    lstm_optimizer: accepted for interface compatibility; not used.
    batch_size: minibatch size for both iterators.
    nb_epoch: number of training epochs.
    by_sentence: train on individual sentences rather than whole documents.

    Returns the trained `Classifier`.
    '''
    nlp = spacy.load('en', entity=False)
    if 'nr_vector' not in lstm_shape:
        lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector)
    if 'nr_dim' not in lstm_shape:
        lstm_shape['nr_dim'] = nlp.vocab.vectors_length
    print("Make model")
    model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings))
    print("Parsing texts...")
    if by_sentence:
        train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length'])
        dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length'])
    else:
        train_data = DocDataset(nlp, train_texts, train_labels)
        dev_data = DocDataset(nlp, dev_texts, dev_labels)
    train_iter = SerialIterator(train_data, batch_size=batch_size,
                                shuffle=True, repeat=True)
    dev_iter = SerialIterator(dev_data, batch_size=batch_size,
                              shuffle=False, repeat=False)
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0)
    # Stop after the requested number of epochs. The trigger used to be
    # hard-coded to a single epoch, so `nb_epoch` had no effect.
    trainer = chainer.training.Trainer(updater, (nb_epoch, 'epoch'), out='result')

    trainer.extend(extensions.Evaluator(dev_iter, model, device=0))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport([
        'epoch', 'main/accuracy', 'validation/main/accuracy']))
    trainer.extend(extensions.ProgressBar())

    trainer.run()
    # Hand the model back: the caller binds the result of train().
    return model
|
||||
|
||||
|
||||
def evaluate(model_dir, texts, labels, max_length=100):
    '''Return the accuracy of the saved sentiment model on `texts`.

    A doc counts as positive when its `sentiment` is >= 0.5; that is
    compared with the truthiness of the matching entry in `labels`.
    Returns 0.0 when `texts` is empty, rather than dividing by zero.
    '''
    def create_pipeline(nlp):
        '''
        This could be a lambda, but named functions are easier to read in Python.
        '''
        return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp,
                                                               max_length=max_length)]

    nlp = spacy.load('en')
    nlp.pipeline = create_pipeline(nlp)

    correct = 0
    total = 0
    for i, doc in enumerate(nlp.pipe(texts, batch_size=1000, n_threads=4)):
        correct += bool(doc.sentiment >= 0.5) == bool(labels[i])
        total = i + 1
    if total == 0:
        return 0.0
    return float(correct) / total
|
||||
|
||||
|
||||
@plac.annotations(
    train_dir=("Location of training file or directory"),
    dev_dir=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    is_runtime=("Demonstrate run-time usage", "flag", "r", bool),
    nr_hidden=("Number of hidden units", "option", "H", int),
    max_length=("Maximum sentence length", "option", "L", int),
    dropout=("Dropout", "option", "d", float),
    learn_rate=("Learn rate", "option", "e", float),
    nb_epoch=("Number of training epochs", "option", "i", int),
    batch_size=("Size of minibatches for training LSTM", "option", "b", int),
    nr_examples=("Limit to N examples", "option", "n", int)
)
def main(model_dir, train_dir, dev_dir,
         is_runtime=False,
         nr_hidden=64, max_length=100,  # Shape
         dropout=0.5, learn_rate=0.001,  # General NN config
         nb_epoch=5, batch_size=32, nr_examples=-1):  # Training params
    '''Command-line entry point: train the model, or evaluate with -r.

    With `is_runtime` set, the saved model in `model_dir` is evaluated on
    the data in `dev_dir` and the accuracy is printed. Otherwise a new
    model is trained on `train_dir` and validated on `dev_dir`.
    '''
    model_dir = pathlib.Path(model_dir)
    train_dir = pathlib.Path(train_dir)
    dev_dir = pathlib.Path(dev_dir)
    if is_runtime:
        dev_texts, dev_labels = read_data(dev_dir)
        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
        print(acc)
    else:
        print("Read data")
        train_texts, train_labels = read_data(train_dir, limit=nr_examples)
        dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
        print("Using GPU 0")
        #chainer.cuda.get_device(0).use()
        train_labels = xp.asarray(train_labels, dtype='i')
        dev_labels = xp.asarray(dev_labels, dtype='i')
        train(train_texts, train_labels, dev_texts, dev_labels,
              {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2,
               'nr_vector': 5000},
              # Pass the requested dropout through; it used to be fixed at
              # 0.5 regardless of the -d option.
              {'dropout': dropout, 'lr': learn_rate},
              {},
              nb_epoch=nb_epoch, batch_size=batch_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # To profile a run, use the three lines below in place of plac.call:
    #cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
    #s = pstats.Stats("Profile.prof")
    #s.strip_dirs().sort_stats("time").print_stats()
    plac.call(main)
|
|
@ -20,71 +20,71 @@ The algorithm is O(n) at run-time for document of length n because we're only ev
|
|||
matching over the tag patterns. So no matter how many phrases we're looking for,
|
||||
our pattern set stays very small (exact size depends on the maximum length we're
|
||||
looking for, as the query language currently has no quantifiers)
|
||||
|
||||
The example expects a .bz2 file from the Reddit corpus, and a patterns file,
|
||||
formatted in jsonl as a sequence of entries like this:
|
||||
|
||||
{"text":"Anchorage"}
|
||||
{"text":"Angola"}
|
||||
{"text":"Ann Arbor"}
|
||||
{"text":"Annapolis"}
|
||||
{"text":"Appalachia"}
|
||||
{"text":"Argentina"}
|
||||
"""
|
||||
from __future__ import print_function, unicode_literals, division
|
||||
from ast import literal_eval
|
||||
from bz2 import BZ2File
|
||||
import time
|
||||
import math
|
||||
import codecs
|
||||
|
||||
import plac
|
||||
import ujson
|
||||
|
||||
from preshed.maps import PreshMap
|
||||
from preshed.counter import PreshCounter
|
||||
from spacy.strings import hash_string
|
||||
from spacy.en import English
|
||||
from spacy.matcher import PhraseMatcher
|
||||
import spacy
|
||||
|
||||
|
||||
def read_gazetteer(tokenizer, loc, n=-1):
|
||||
for i, line in enumerate(open(loc)):
|
||||
phrase = literal_eval('u' + line.strip())
|
||||
if ' (' in phrase and phrase.endswith(')'):
|
||||
phrase = phrase.split(' (', 1)[0]
|
||||
if i >= n:
|
||||
break
|
||||
phrase = tokenizer(phrase)
|
||||
if all((t.is_lower and t.prob >= -10) for t in phrase):
|
||||
continue
|
||||
data = ujson.loads(line.strip())
|
||||
phrase = tokenizer(data['text'])
|
||||
for w in phrase:
|
||||
_ = tokenizer.vocab[w.text]
|
||||
if len(phrase) >= 2:
|
||||
yield phrase
|
||||
|
||||
|
||||
def read_text(bz2_loc):
|
||||
def read_text(bz2_loc, n=10000):
|
||||
with BZ2File(bz2_loc) as file_:
|
||||
for line in file_:
|
||||
yield line.decode('utf8')
|
||||
for i, line in enumerate(file_):
|
||||
data = ujson.loads(line)
|
||||
yield data['body']
|
||||
if i >= n:
|
||||
break
|
||||
|
||||
|
||||
def get_matches(tokenizer, phrases, texts, max_length=6):
|
||||
matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length)
|
||||
print("Match")
|
||||
matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
|
||||
matcher.add('Phrase', None, *phrases)
|
||||
for text in texts:
|
||||
doc = tokenizer(text)
|
||||
for w in doc:
|
||||
_ = doc.vocab[w.text]
|
||||
matches = matcher(doc)
|
||||
for mwe in doc.ents:
|
||||
yield mwe
|
||||
for ent_id, start, end in matches:
|
||||
yield (ent_id, doc[start:end].text)
|
||||
|
||||
|
||||
def main(patterns_loc, text_loc, counts_loc, n=10000000):
|
||||
nlp = English(parser=False, tagger=False, entity=False)
|
||||
print("Make matcher")
|
||||
phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
|
||||
counts = PreshCounter()
|
||||
def main(patterns_loc, text_loc, n=10000):
|
||||
nlp = spacy.blank('en')
|
||||
nlp.vocab.lex_attr_getters = {}
|
||||
phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
|
||||
count = 0
|
||||
t1 = time.time()
|
||||
for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
|
||||
counts.inc(hash_string(mwe.text), 1)
|
||||
for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
|
||||
count += 1
|
||||
t2 = time.time()
|
||||
print("10m tokens in %d s" % (t2 - t1))
|
||||
|
||||
with codecs.open(counts_loc, 'w', 'utf8') as file_:
|
||||
for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
|
||||
text = phrase.string
|
||||
key = hash_string(text)
|
||||
count = counts[key]
|
||||
if count != 0:
|
||||
file_.write('%d\t%s\n' % (count, text))
|
||||
print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
|
@ -13,24 +13,29 @@ Input data:
|
|||
https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip
|
||||
|
||||
Developed for: spaCy 1.7.1
|
||||
Last tested for: spaCy 1.7.1
|
||||
Last tested for: spaCy 2.0.0a13
|
||||
'''
|
||||
from __future__ import unicode_literals, print_function
|
||||
import plac
|
||||
from pathlib import Path
|
||||
import random
|
||||
import json
|
||||
import tqdm
|
||||
|
||||
from thinc.neural.optimizers import Adam
|
||||
from thinc.neural.ops import NumpyOps
|
||||
|
||||
import spacy.orth as orth_funcs
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.pipeline import BeamEntityRecognizer
|
||||
from spacy.pipeline import EntityRecognizer
|
||||
from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
|
||||
from spacy.tokenizer import Tokenizer
|
||||
from spacy.tokens import Doc
|
||||
from spacy.attrs import *
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.gold import _iob_to_biluo as iob_to_biluo
|
||||
from spacy.gold import iob_to_biluo
|
||||
from spacy.gold import minibatch
|
||||
from spacy.scorer import Scorer
|
||||
import spacy.util
|
||||
|
||||
|
||||
try:
|
||||
unicode
|
||||
|
@ -38,96 +43,38 @@ except NameError:
|
|||
unicode = str
|
||||
|
||||
|
||||
spacy.util.set_env_log(True)
|
||||
|
||||
|
||||
def init_vocab():
    '''Create a fresh Vocab with a basic set of lexical attribute getters.'''
    lowercase = lambda string: string.lower()
    getters = {
        LOWER: lowercase,
        SHAPE: orth_funcs.word_shape,
        NORM: lowercase,
        PREFIX: lambda string: string[0],
        SUFFIX: lambda string: string[-3:],
        CLUSTER: lambda string: 0,
        IS_ALPHA: orth_funcs.is_alpha,
        IS_ASCII: orth_funcs.is_ascii,
        IS_DIGIT: lambda string: string.isdigit(),
        IS_LOWER: orth_funcs.is_lower,
        IS_PUNCT: orth_funcs.is_punct,
        IS_SPACE: lambda string: string.isspace(),
        IS_TITLE: orth_funcs.is_title,
        IS_UPPER: orth_funcs.is_upper,
        IS_STOP: lambda string: False,
        IS_OOV: lambda string: True,
    }
    return Vocab(lex_attr_getters=getters)
|
||||
|
||||
|
||||
def save_vocab(vocab, path):
    '''Write `vocab` into the directory `path`, creating it if missing.

    Produces `strings.json` and `lexemes.bin` inside the directory.
    Raises IOError if `path` exists but is not a directory.
    '''
    target = Path(path)
    if target.exists():
        if not target.is_dir():
            raise IOError("Can't save vocab to %s\nNot a directory" % target)
    else:
        target.mkdir()
    with (target / 'strings.json').open('w') as file_:
        vocab.strings.dump(file_)
    lexemes_loc = (target / 'lexemes.bin').as_posix()
    vocab.dump(lexemes_loc)
|
||||
|
||||
|
||||
def load_vocab(path):
    '''Load a Vocab from the directory `path`.

    Raises IOError if `path` does not exist or is not a directory.
    '''
    source = Path(path)
    if source.is_dir():
        return Vocab.load(source)
    # Not a usable directory: report which of the two problems it is.
    if source.exists():
        raise IOError("Cannot load vocab from %s\nNot a directory" % source)
    raise IOError("Cannot load vocab from %s\nDoes not exist" % source)
|
||||
|
||||
|
||||
def init_ner_model(vocab, features=None):
    '''Create an EntityRecognizer for `vocab`.

    When `features` is None, the recognizer's own `feature_templates` are
    used (as a tuple).
    '''
    if features is not None:
        return EntityRecognizer(vocab, features=features)
    default_features = tuple(EntityRecognizer.feature_templates)
    return EntityRecognizer(vocab, features=default_features)
|
||||
|
||||
|
||||
def save_ner_model(model, path):
    '''Write an NER model's weights and config into the directory `path`.

    The directory is created if missing. Weights go to `model`, and
    `model.cfg` is serialised as JSON to `config.json`.
    Raises IOError if `path` is not a directory.
    '''
    target = Path(path)
    if not target.exists():
        target.mkdir()
    if not target.is_dir():
        raise IOError("Can't save model to %s\nNot a directory" % target)
    weights_loc = (target / 'model').as_posix()
    model.model.dump(weights_loc)
    with (target / 'config.json').open('w') as file_:
        config = json.dumps(model.cfg)
        # The file is opened in text mode, so make sure we write text.
        if not isinstance(config, unicode):
            config = config.decode('utf8')
        file_.write(config)
|
||||
|
||||
|
||||
def load_ner_model(vocab, path):
    '''Load an EntityRecognizer from `path`, tied to `vocab`.'''
    ner_model = EntityRecognizer.load(path, vocab)
    return ner_model
|
||||
|
||||
|
||||
class Pipeline(object):
|
||||
@classmethod
|
||||
def load(cls, path):
|
||||
path = Path(path)
|
||||
if not path.exists():
|
||||
raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
|
||||
if not path.is_dir():
|
||||
raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
|
||||
vocab = load_vocab(path)
|
||||
tokenizer = Tokenizer(vocab, {}, None, None, None)
|
||||
ner_model = load_ner_model(vocab, path / 'ner')
|
||||
return cls(vocab, tokenizer, ner_model)
|
||||
|
||||
def __init__(self, vocab=None, tokenizer=None, entity=None):
|
||||
if vocab is None:
|
||||
vocab = init_vocab()
|
||||
if tokenizer is None:
|
||||
tokenizer = Tokenizer(vocab, {}, None, None, None)
|
||||
if entity is None:
|
||||
entity = init_ner_model(self.vocab)
|
||||
entity = NeuralEntityRecognizer(vocab)
|
||||
self.vocab = vocab
|
||||
self.tokenizer = tokenizer
|
||||
self.entity = entity
|
||||
self.pipeline = [self.entity]
|
||||
|
||||
def begin_training(self):
|
||||
for model in self.pipeline:
|
||||
model.begin_training([])
|
||||
optimizer = Adam(NumpyOps(), 0.001)
|
||||
return optimizer
|
||||
|
||||
def __call__(self, input_):
|
||||
doc = self.make_doc(input_)
|
||||
for process in self.pipeline:
|
||||
|
@ -147,14 +94,16 @@ class Pipeline(object):
|
|||
gold = GoldParse(doc, entities=annotations)
|
||||
return gold
|
||||
|
||||
def update(self, input_, annot):
|
||||
doc = self.make_doc(input_)
|
||||
gold = self.make_gold(input_, annot)
|
||||
for ner in gold.ner:
|
||||
if ner not in (None, '-', 'O'):
|
||||
action, label = ner.split('-', 1)
|
||||
self.entity.add_label(label)
|
||||
return self.entity.update(doc, gold)
|
||||
def update(self, inputs, annots, sgd, losses=None, drop=0.):
|
||||
if losses is None:
|
||||
losses = {}
|
||||
docs = [self.make_doc(input_) for input_ in inputs]
|
||||
golds = [self.make_gold(input_, annot) for input_, annot in
|
||||
zip(inputs, annots)]
|
||||
|
||||
self.entity.update(docs, golds, drop=drop,
|
||||
sgd=sgd, losses=losses)
|
||||
return losses
|
||||
|
||||
def evaluate(self, examples):
|
||||
scorer = Scorer()
|
||||
|
@ -164,34 +113,36 @@ class Pipeline(object):
|
|||
scorer.score(doc, gold)
|
||||
return scorer.scores
|
||||
|
||||
def average_weights(self):
|
||||
self.entity.model.end_training()
|
||||
|
||||
def save(self, path):
|
||||
def to_disk(self, path):
|
||||
path = Path(path)
|
||||
if not path.exists():
|
||||
path.mkdir()
|
||||
elif not path.is_dir():
|
||||
raise IOError("Can't save pipeline to %s\nNot a directory" % path)
|
||||
save_vocab(self.vocab, path / 'vocab')
|
||||
save_ner_model(self.entity, path / 'ner')
|
||||
self.vocab.to_disk(path / 'vocab')
|
||||
self.entity.to_disk(path / 'ner')
|
||||
|
||||
def from_disk(self, path):
|
||||
path = Path(path)
|
||||
if not path.exists():
|
||||
raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
|
||||
if not path.is_dir():
|
||||
raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
|
||||
self.vocab = self.vocab.from_disk(path / 'vocab')
|
||||
self.entity = self.entity.from_disk(path / 'ner')
|
||||
|
||||
|
||||
def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
|
||||
next_epoch = train_examples
|
||||
def train(nlp, train_examples, dev_examples, nr_epoch=5):
|
||||
sgd = nlp.begin_training()
|
||||
print("Iter", "Loss", "P", "R", "F")
|
||||
for i in range(nr_epoch):
|
||||
this_epoch = next_epoch
|
||||
next_epoch = []
|
||||
loss = 0
|
||||
for input_, annot in this_epoch:
|
||||
loss += nlp.update(input_, annot)
|
||||
if (i+1) < nr_epoch:
|
||||
next_epoch.append((input_, annot))
|
||||
random.shuffle(next_epoch)
|
||||
random.shuffle(train_examples)
|
||||
losses = {}
|
||||
for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8):
|
||||
inputs, annots = zip(*batch)
|
||||
nlp.update(list(inputs), list(annots), sgd, losses=losses)
|
||||
scores = nlp.evaluate(dev_examples)
|
||||
report_scores(i, loss, scores)
|
||||
nlp.average_weights()
|
||||
report_scores(i, losses['ner'], scores)
|
||||
scores = nlp.evaluate(dev_examples)
|
||||
report_scores(channels, i+1, loss, scores)
|
||||
|
||||
|
@ -208,7 +159,8 @@ def read_examples(path):
|
|||
with path.open() as file_:
|
||||
sents = file_.read().strip().split('\n\n')
|
||||
for sent in sents:
|
||||
if not sent.strip():
|
||||
sent = sent.strip()
|
||||
if not sent:
|
||||
continue
|
||||
tokens = sent.split('\n')
|
||||
while tokens and tokens[0].startswith('#'):
|
||||
|
@ -217,28 +169,39 @@ def read_examples(path):
|
|||
iob = []
|
||||
for token in tokens:
|
||||
if token.strip():
|
||||
pieces = token.split()
|
||||
pieces = token.split('\t')
|
||||
words.append(pieces[1])
|
||||
iob.append(pieces[2])
|
||||
yield words, iob_to_biluo(iob)
|
||||
|
||||
|
||||
def get_labels(examples):
    '''Collect the entity labels used in `examples`.

    examples: iterable of `(words, tags)` pairs, where each tag looks like
        `<action>-<label>` (for example 'B-PER'), or is a label-free tag
        such as 'O' or '-'.

    Returns the distinct labels as a sorted list.
    '''
    labels = set()
    for words, tags in examples:
        for tag in tags:
            # Split on the first hyphen only, so a label that itself
            # contains a hyphen is kept whole. The bare '-' tag has no
            # label after the hyphen and is skipped.
            action, sep, label = tag.partition('-')
            if sep and label:
                labels.add(label)
    return sorted(labels)
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model_dir=("Path to save the model", "positional", None, Path),
|
||||
train_loc=("Path to your training data", "positional", None, Path),
|
||||
dev_loc=("Path to your development data", "positional", None, Path),
|
||||
)
|
||||
def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
|
||||
train_loc=None, dev_loc=None, nr_epoch=30):
|
||||
|
||||
train_examples = read_examples(train_loc)
|
||||
def main(model_dir, train_loc, dev_loc, nr_epoch=30):
|
||||
print(model_dir, train_loc, dev_loc)
|
||||
train_examples = list(read_examples(train_loc))
|
||||
dev_examples = read_examples(dev_loc)
|
||||
nlp = Pipeline.load(model_dir)
|
||||
nlp = Pipeline()
|
||||
for label in get_labels(train_examples):
|
||||
nlp.entity.add_label(label)
|
||||
print("Add label", label)
|
||||
|
||||
train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)
|
||||
train(nlp, train_examples, list(dev_examples), nr_epoch)
|
||||
|
||||
nlp.save(model_dir)
|
||||
nlp.to_disk(model_dir)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
plac.call(main)
|
||||
|
|
|
@ -25,7 +25,7 @@ For more details, see the documentation:
|
|||
* Saving and loading models: https://spacy.io/docs/usage/saving-loading
|
||||
|
||||
Developed for: spaCy 1.7.6
|
||||
Last tested for: spaCy 1.7.6
|
||||
Last updated for: spaCy 2.0.0a13
|
||||
"""
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
|
@ -34,55 +34,41 @@ from pathlib import Path
|
|||
import random
|
||||
|
||||
import spacy
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.tagger import Tagger
|
||||
from spacy.gold import GoldParse, minibatch
|
||||
from spacy.pipeline import NeuralEntityRecognizer
|
||||
from spacy.pipeline import TokenVectorEncoder
|
||||
|
||||
|
||||
def get_gold_parses(tokenizer, train_data):
    '''Shuffle `train_data` in place, then yield a (doc, gold) pair per example.

    Each example is a `(raw_text, entity_offsets)` pair. The text is turned
    into a doc with `tokenizer` and the offsets become the GoldParse
    entities.
    '''
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = tokenizer(raw_text)
        yield doc, GoldParse(doc, entities=entity_offsets)
|
||||
|
||||
|
||||
def train_ner(nlp, train_data, output_dir):
|
||||
# Add new words to vocab
|
||||
for raw_text, _ in train_data:
|
||||
doc = nlp.make_doc(raw_text)
|
||||
for word in doc:
|
||||
_ = nlp.vocab[word.orth]
|
||||
random.seed(0)
|
||||
# You may need to change the learning rate. It's generally difficult to
|
||||
# guess what rate you should set, especially when you have limited data.
|
||||
nlp.entity.model.learn_rate = 0.001
|
||||
for itn in range(1000):
|
||||
random.shuffle(train_data)
|
||||
loss = 0.
|
||||
for raw_text, entity_offsets in train_data:
|
||||
gold = GoldParse(doc, entities=entity_offsets)
|
||||
# By default, the GoldParse class assumes that the entities
|
||||
# described by offset are complete, and all other words should
|
||||
# have the tag 'O'. You can tell it to make no assumptions
|
||||
# about the tag of a word by giving it the tag '-'.
|
||||
# However, this allows a trivial solution to the current
|
||||
# learning problem: if words are either 'any tag' or 'ANIMAL',
|
||||
# the model can learn that all words can be tagged 'ANIMAL'.
|
||||
#for i in range(len(gold.ner)):
|
||||
#if not gold.ner[i].endswith('ANIMAL'):
|
||||
# gold.ner[i] = '-'
|
||||
doc = nlp.make_doc(raw_text)
|
||||
nlp.tagger(doc)
|
||||
# As of 1.9, spaCy's parser now lets you supply a dropout probability
|
||||
# This might help the model generalize better from only a few
|
||||
# examples.
|
||||
loss += nlp.entity.update(doc, gold, drop=0.9)
|
||||
if loss == 0:
|
||||
break
|
||||
# This step averages the model's weights. This may or may not be good for
|
||||
# your situation --- it's empirical.
|
||||
nlp.end_training()
|
||||
if output_dir:
|
||||
if not output_dir.exists():
|
||||
optimizer = nlp.begin_training(lambda: [])
|
||||
nlp.meta['name'] = 'en_ent_animal'
|
||||
for itn in range(50):
|
||||
losses = {}
|
||||
for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3):
|
||||
docs, golds = zip(*batch)
|
||||
nlp.update(docs, golds, losses=losses, sgd=optimizer, update_shared=True,
|
||||
drop=0.35)
|
||||
print(losses)
|
||||
if not output_dir:
|
||||
return
|
||||
elif not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
nlp.save_to_directory(output_dir)
|
||||
nlp.to_disk(output_dir)
|
||||
|
||||
|
||||
def main(model_name, output_directory=None):
|
||||
print("Loading initial model", model_name)
|
||||
nlp = spacy.load(model_name)
|
||||
print("Creating initial model", model_name)
|
||||
nlp = spacy.blank(model_name)
|
||||
if output_directory is not None:
|
||||
output_directory = Path(output_directory)
|
||||
|
||||
|
@ -91,6 +77,11 @@ def main(model_name, output_directory=None):
|
|||
"Horses are too tall and they pretend to care about your feelings",
|
||||
[(0, 6, 'ANIMAL')],
|
||||
),
|
||||
(
|
||||
"Do they bite?",
|
||||
[],
|
||||
),
|
||||
|
||||
(
|
||||
"horses are too tall and they pretend to care about your feelings",
|
||||
[(0, 6, 'ANIMAL')]
|
||||
|
@ -109,18 +100,20 @@ def main(model_name, output_directory=None):
|
|||
)
|
||||
|
||||
]
|
||||
nlp.entity.add_label('ANIMAL')
|
||||
nlp.pipeline.append(TokenVectorEncoder(nlp.vocab))
|
||||
nlp.pipeline.append(NeuralEntityRecognizer(nlp.vocab))
|
||||
nlp.pipeline[-1].add_label('ANIMAL')
|
||||
train_ner(nlp, train_data, output_directory)
|
||||
|
||||
# Test that the entity is recognized
|
||||
doc = nlp('Do you like horses?')
|
||||
text = 'Do you like horses?'
|
||||
print("Ents in 'Do you like horses?':")
|
||||
doc = nlp(text)
|
||||
for ent in doc.ents:
|
||||
print(ent.label_, ent.text)
|
||||
if output_directory:
|
||||
print("Loading from", output_directory)
|
||||
nlp2 = spacy.load('en', path=output_directory)
|
||||
nlp2.entity.add_label('ANIMAL')
|
||||
nlp2 = spacy.load(output_directory)
|
||||
doc2 = nlp2('Do you like horses?')
|
||||
for ent in doc2.ents:
|
||||
print(ent.label_, ent.text)
|
||||
|
|
|
@ -1,3 +1,7 @@
|
|||
'''Train a multi-label convolutional neural network text classifier,
|
||||
using the spacy.pipeline.TextCategorizer component. The model is then added
|
||||
to spacy.pipeline, and predictions are available at `doc.cats`.
|
||||
'''
|
||||
from __future__ import unicode_literals
|
||||
import plac
|
||||
import random
|
||||
|
@ -12,6 +16,11 @@ from spacy.gold import GoldParse, minibatch
|
|||
from spacy.util import compounding
|
||||
from spacy.pipeline import TextCategorizer
|
||||
|
||||
# TODO: Remove this once we're not supporting models trained with thinc <6.9.0
|
||||
import thinc.neural._classes.layernorm
|
||||
thinc.neural._classes.layernorm.set_compat_six_eight(False)
|
||||
|
||||
|
||||
|
||||
def train_textcat(tokenizer, textcat,
|
||||
train_texts, train_cats, dev_texts, dev_cats,
|
||||
|
@ -24,14 +33,15 @@ def train_textcat(tokenizer, textcat,
|
|||
train_docs = [tokenizer(text) for text in train_texts]
|
||||
train_gold = [GoldParse(doc, cats=cats) for doc, cats in
|
||||
zip(train_docs, train_cats)]
|
||||
train_data = zip(train_docs, train_gold)
|
||||
train_data = list(zip(train_docs, train_gold))
|
||||
batch_sizes = compounding(4., 128., 1.001)
|
||||
for i in range(n_iter):
|
||||
losses = {}
|
||||
train_data = tqdm.tqdm(train_data, leave=False) # Progress bar
|
||||
for batch in minibatch(train_data, size=batch_sizes):
|
||||
# Progress bar and minibatching
|
||||
batches = minibatch(tqdm.tqdm(train_data, leave=False), size=batch_sizes)
|
||||
for batch in batches:
|
||||
docs, golds = zip(*batch)
|
||||
textcat.update((docs, None), golds, sgd=optimizer, drop=0.2,
|
||||
textcat.update(docs, golds, sgd=optimizer, drop=0.2,
|
||||
losses=losses)
|
||||
with textcat.model.use_params(optimizer.averages):
|
||||
scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
|
||||
|
@ -61,12 +71,13 @@ def evaluate(tokenizer, textcat, texts, cats):
|
|||
return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
|
||||
|
||||
|
||||
def load_data():
|
||||
def load_data(limit=0):
|
||||
# Partition off part of the train data --- avoid running experiments
|
||||
# against test.
|
||||
train_data, _ = thinc.extra.datasets.imdb()
|
||||
|
||||
random.shuffle(train_data)
|
||||
train_data = train_data[-limit:]
|
||||
|
||||
texts, labels = zip(*train_data)
|
||||
cats = [(['POSITIVE'] if y else []) for y in labels]
|
||||
|
@ -86,7 +97,7 @@ def main(model_loc=None):
|
|||
textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
|
||||
|
||||
print("Load IMDB data")
|
||||
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()
|
||||
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=1000)
|
||||
|
||||
print("Itn.\tLoss\tP\tR\tF")
|
||||
progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
|
||||
|
|
30
examples/vectors_fast_text.py
Normal file
30
examples/vectors_fast_text.py
Normal file
|
@ -0,0 +1,30 @@
|
|||
'''Load vectors for a language trained using FastText
|
||||
|
||||
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
|
||||
'''
|
||||
from __future__ import unicode_literals
|
||||
import plac
|
||||
import numpy
|
||||
|
||||
import spacy.language
|
||||
|
||||
|
||||
def main(vectors_loc):
    """Load fastText-style text vectors into a blank pipeline and print a
    similarity score as a smoke test.

    vectors_loc: Path to a vectors file. The first line is read as a
        two-field header (row count, vector width); every following line
        is a token followed by its vector values, separated by spaces.
    """
    nlp = spacy.language.Language()

    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        # Only the vector width is needed; the row count is ignored.
        _, nr_dim = header.split()
        nr_dim = int(nr_dim)
        nlp.vocab.clear_vectors(nr_dim)
        for line in file_:
            # Strip the line ending (and any trailing separator) before
            # decoding, so it can't produce an empty trailing field.
            line = line.rstrip().decode('utf8')
            if not line:
                continue
            # Split from the right into at most nr_dim values: the token
            # itself may contain whitespace, which a plain split() would
            # break into extra pieces that then fail float() parsing.
            pieces = line.rsplit(' ', nr_dim)
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)
    doc = nlp(u'class colspan')
    print(doc[0].similarity(doc[1]))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
5
fabfile.py
vendored
5
fabfile.py
vendored
|
@ -14,6 +14,7 @@ VENV_DIR = path.join(PWD, ENV)
|
|||
def env(lang='python2.7'):
|
||||
if path.exists(VENV_DIR):
|
||||
local('rm -rf {env}'.format(env=VENV_DIR))
|
||||
local('pip install virtualenv')
|
||||
local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
|
||||
|
||||
|
||||
|
@ -32,6 +33,10 @@ def make():
|
|||
local('pip install -r requirements.txt')
|
||||
local('python setup.py build_ext --inplace')
|
||||
|
||||
def sdist():
|
||||
with virtualenv(VENV_DIR):
|
||||
with lcd(path.dirname(__file__)):
|
||||
local('python setup.py sdist')
|
||||
|
||||
def clean():
|
||||
with lcd(path.dirname(__file__)):
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
cython<0.24
|
||||
cython>=0.24,<0.27.0
|
||||
pathlib
|
||||
numpy>=1.7
|
||||
cymem>=1.30,<1.32
|
||||
preshed>=1.0.0,<2.0.0
|
||||
thinc>=6.8.0,<6.9.0
|
||||
thinc>=6.9.0,<6.10.0
|
||||
murmurhash>=0.28,<0.29
|
||||
plac<1.0.0,>=0.9.6
|
||||
six
|
||||
|
@ -13,7 +13,7 @@ requests>=2.13.0,<3.0.0
|
|||
regex==2017.4.5
|
||||
ftfy>=4.4.2,<5.0.0
|
||||
pytest>=3.0.6,<4.0.0
|
||||
pip>=9.0.0,<10.0.0
|
||||
mock>=2.0.0,<3.0.0
|
||||
msgpack-python
|
||||
msgpack-numpy
|
||||
html5lib==1.0b8
|
||||
|
|
3
setup.py
3
setup.py
|
@ -195,9 +195,8 @@ def setup_package():
|
|||
'murmurhash>=0.28,<0.29',
|
||||
'cymem>=1.30,<1.32',
|
||||
'preshed>=1.0.0,<2.0.0',
|
||||
'thinc>=6.8.0,<6.9.0',
|
||||
'thinc>=6.9.0,<6.10.0',
|
||||
'plac<1.0.0,>=0.9.6',
|
||||
'pip>=9.0.0,<10.0.0',
|
||||
'six',
|
||||
'pathlib',
|
||||
'ujson>=1.35',
|
||||
|
|
|
@ -4,11 +4,13 @@ from __future__ import unicode_literals
|
|||
from .cli.info import info as cli_info
|
||||
from .glossary import explain
|
||||
from .deprecated import resolve_load_name
|
||||
#from .about import __version__
|
||||
from .about import __version__
|
||||
from . import util
|
||||
|
||||
|
||||
def load(name, **overrides):
|
||||
from .deprecated import resolve_load_name
|
||||
name = resolve_load_name(name, **overrides)
|
||||
return util.load_model(name, **overrides)
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ if __name__ == '__main__':
|
|||
import plac
|
||||
import sys
|
||||
from spacy.cli import download, link, info, package, train, convert, model
|
||||
from spacy.cli import profile
|
||||
from spacy.cli import profile, evaluate
|
||||
from spacy.util import prints
|
||||
|
||||
commands = {
|
||||
|
@ -15,6 +15,7 @@ if __name__ == '__main__':
|
|||
'link': link,
|
||||
'info': info,
|
||||
'train': train,
|
||||
'evaluate': evaluate,
|
||||
'convert': convert,
|
||||
'package': package,
|
||||
'model': model,
|
||||
|
|
153
spacy/_ml.py
153
spacy/_ml.py
|
@ -1,28 +1,27 @@
|
|||
import ujson
|
||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
|
||||
from thinc.i2v import HashEmbed, StaticVectors
|
||||
from thinc.t2t import ExtractWindow, ParametricAttention
|
||||
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
|
||||
from thinc.misc import Residual
|
||||
from thinc.misc import BatchNorm as BN
|
||||
from thinc.misc import LayerNorm as LN
|
||||
|
||||
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
||||
from thinc.neural import Model, Maxout, Softmax, Affine
|
||||
from thinc.neural._classes.hash_embed import HashEmbed
|
||||
from thinc.api import FeatureExtracter, with_getitem
|
||||
from thinc.api import uniqued, wrap, flatten_add_lengths, noop
|
||||
|
||||
from thinc.linear.linear import LinearModel
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.util import get_array_module
|
||||
|
||||
import random
|
||||
import cytoolz
|
||||
|
||||
from thinc.neural._classes.convolution import ExtractWindow
|
||||
from thinc.neural._classes.static_vectors import StaticVectors
|
||||
from thinc.neural._classes.batchnorm import BatchNorm as BN
|
||||
from thinc.neural._classes.layernorm import LayerNorm as LN
|
||||
from thinc.neural._classes.resnet import Residual
|
||||
from thinc.neural import ReLu
|
||||
from thinc.neural._classes.selu import SELU
|
||||
from thinc import describe
|
||||
from thinc.describe import Dimension, Synapses, Biases, Gradient
|
||||
from thinc.neural._classes.affine import _set_dimensions_if_needed
|
||||
from thinc.api import FeatureExtracter, with_getitem
|
||||
from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
|
||||
from thinc.neural._classes.attention import ParametricAttention
|
||||
from thinc.linear.linear import LinearModel
|
||||
from thinc.api import uniqued, wrap, flatten_add_lengths
|
||||
|
||||
import thinc.extra.load_nlp
|
||||
|
||||
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
|
||||
from .tokens.doc import Doc
|
||||
|
@ -31,6 +30,11 @@ from . import util
|
|||
import numpy
|
||||
import io
|
||||
|
||||
# TODO: Unset this once we no longer want to support previously trained models.
|
||||
import thinc.neural._classes.layernorm
|
||||
thinc.neural._classes.layernorm.set_compat_six_eight(True)
|
||||
|
||||
VECTORS_KEY = 'spacy_pretrained_vectors'
|
||||
|
||||
@layerize
|
||||
def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
||||
|
@ -225,33 +229,80 @@ def drop_layer(layer, factor=2.):
|
|||
model.predict = layer
|
||||
return model
|
||||
|
||||
def link_vectors_to_models(vocab):
    """Set each lexeme's rank to its vector row and register the vector
    table in thinc's global VECTORS dict.

    vocab: Object exposing `.vectors` (with `key2row` and `data`) and
        iteration over lexemes that have `orth` and `rank` attributes.
    """
    table = vocab.vectors
    backend = Model.ops
    row_of = table.key2row
    for lexeme in vocab:
        # Lexemes without a vector fall back to row 0.
        lexeme.rank = row_of[lexeme.orth] if lexeme.orth in row_of else 0
    # Set an entry here, so that vectors are accessed by StaticVectors
    # (not ideal: this is module-level shared state keyed by device and name).
    thinc.extra.load_nlp.VECTORS[(backend.device, VECTORS_KEY)] = backend.asarray(table.data)
|
||||
|
||||
def Tok2Vec(width, embed_size, preprocess=None):
|
||||
def Tok2Vec(width, embed_size, **kwargs):
|
||||
pretrained_dims = kwargs.get('pretrained_dims', 0)
|
||||
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
|
||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
|
||||
'*': reapply}):
|
||||
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
|
||||
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
|
||||
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
|
||||
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
|
||||
if pretrained_dims is not None and pretrained_dims >= 1:
|
||||
glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
|
||||
|
||||
embed = uniqued(
|
||||
(glove | norm | prefix | suffix | shape)
|
||||
>> LN(Maxout(width, width*5, pieces=3)), column=5)
|
||||
else:
|
||||
embed = uniqued(
|
||||
(norm | prefix | suffix | shape)
|
||||
>> LN(Maxout(width, width*4, pieces=3)), column=5)
|
||||
|
||||
|
||||
convolution = Residual(
|
||||
ExtractWindow(nW=1)
|
||||
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
|
||||
)
|
||||
|
||||
embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
|
||||
tok2vec = (
|
||||
with_flatten(
|
||||
asarray(Model.ops, dtype='uint64')
|
||||
>> uniqued(embed, column=5)
|
||||
>> Residual(
|
||||
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
|
||||
) ** 4, pad=4
|
||||
FeatureExtracter(cols)
|
||||
>> with_flatten(
|
||||
embed >> (convolution ** 4), pad=4)
|
||||
)
|
||||
)
|
||||
if preprocess not in (False, None):
|
||||
tok2vec = preprocess >> tok2vec
|
||||
|
||||
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
|
||||
tok2vec.nO = width
|
||||
tok2vec.embed = embed
|
||||
return tok2vec
|
||||
|
||||
|
||||
def reapply(layer, n_times):
    """Wrap `layer` so that it is applied `n_times` in sequence, each
    application consuming the previous application's output.

    layer: Object with a `begin_update(X, drop=...)` method returning
        `(output, backprop_callback)`.
    n_times: Number of sequential applications. With 0 the forward pass
        never assigns its output and fails on return.
    Returns the result of `wrap(reapply_fwd, layer)`.
    """
    def reapply_fwd(X, drop=0.):
        callbacks = []
        for _ in range(n_times):
            Y, finish_update = layer.begin_update(X, drop=drop)
            callbacks.append(finish_update)
            # Feed this application's output into the next one.
            X = Y

        def reapply_bwd(dY, sgd=None):
            # Walk the applications last-to-first, passing each gradient
            # back through the matching callback.
            # NOTE(review): the returned value is the running sum of every
            # intermediate gradient, accumulated in place on the first one,
            # rather than only the final gradient -- confirm this is intended.
            dX = None
            for finish_update in callbacks[::-1]:
                dY = finish_update(dY, sgd=sgd)
                if dX is not None:
                    dX += dY
                else:
                    dX = dY
            return dX
        return Y, reapply_bwd
    return wrap(reapply_fwd, layer)
|
||||
|
||||
|
||||
|
||||
|
||||
def asarray(ops, dtype):
|
||||
def forward(X, drop=0.):
|
||||
return ops.asarray(X, dtype=dtype), None
|
||||
|
@ -455,20 +506,25 @@ def getitem(i):
|
|||
return X[i], None
|
||||
return layerize(getitem_fwd)
|
||||
|
||||
def build_tagger_model(nr_class, token_vector_width, **cfg):
|
||||
embed_size = util.env_opt('embed_size', 7500)
|
||||
def build_tagger_model(nr_class, **cfg):
|
||||
embed_size = util.env_opt('embed_size', 7000)
|
||||
if 'token_vector_width' in cfg:
|
||||
token_vector_width = cfg['token_vector_width']
|
||||
else:
|
||||
token_vector_width = util.env_opt('token_vector_width', 128)
|
||||
pretrained_dims = cfg.get('pretrained_dims', 0)
|
||||
with Model.define_operators({'>>': chain, '+': add}):
|
||||
# Input: (doc, tensor) tuples
|
||||
private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())
|
||||
|
||||
if 'tok2vec' in cfg:
|
||||
tok2vec = cfg['tok2vec']
|
||||
else:
|
||||
tok2vec = Tok2Vec(token_vector_width, embed_size,
|
||||
pretrained_dims=pretrained_dims)
|
||||
model = (
|
||||
fine_tune(private_tok2vec)
|
||||
>> with_flatten(
|
||||
Maxout(token_vector_width, token_vector_width)
|
||||
>> Softmax(nr_class, token_vector_width)
|
||||
)
|
||||
tok2vec
|
||||
>> with_flatten(Softmax(nr_class, token_vector_width))
|
||||
)
|
||||
model.nI = None
|
||||
model.tok2vec = tok2vec
|
||||
return model
|
||||
|
||||
|
||||
|
@ -514,6 +570,7 @@ def foreach(layer, drop_factor=1.0):
|
|||
|
||||
def build_text_classifier(nr_class, width=64, **cfg):
|
||||
nr_vector = cfg.get('nr_vector', 5000)
|
||||
pretrained_dims = cfg.get('pretrained_dims', 0)
|
||||
with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
|
||||
'**': clone}):
|
||||
if cfg.get('low_data'):
|
||||
|
@ -521,7 +578,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
|||
SpacyVectors
|
||||
>> flatten_add_lengths
|
||||
>> with_getitem(0,
|
||||
Affine(width, 300)
|
||||
Affine(width, pretrained_dims)
|
||||
)
|
||||
>> ParametricAttention(width)
|
||||
>> Pooling(sum_pool)
|
||||
|
@ -548,18 +605,24 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
|||
)
|
||||
)
|
||||
|
||||
if pretrained_dims:
|
||||
static_vectors = (
|
||||
SpacyVectors
|
||||
>> with_flatten(Affine(width, 300))
|
||||
>> with_flatten(Affine(width, pretrained_dims))
|
||||
)
|
||||
|
||||
cnn_model = (
|
||||
# TODO Make concatenate support lists
|
||||
concatenate_lists(trained_vectors, static_vectors)
|
||||
vectors = concatenate_lists(trained_vectors, static_vectors)
|
||||
vectors_width = width*2
|
||||
else:
|
||||
vectors = trained_vectors
|
||||
vectors_width = width
|
||||
static_vectors = None
|
||||
cnn_model = (
|
||||
vectors
|
||||
>> with_flatten(
|
||||
LN(Maxout(width, width*2))
|
||||
LN(Maxout(width, vectors_width))
|
||||
>> Residual(
|
||||
(ExtractWindow(nW=1) >> zero_init(Maxout(width, width*3)))
|
||||
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
|
||||
) ** 2, pad=2
|
||||
)
|
||||
>> flatten_add_lengths
|
||||
|
@ -579,7 +642,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
|||
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
|
||||
>> logistic
|
||||
)
|
||||
|
||||
model.nO = nr_class
|
||||
model.lsuv = False
|
||||
return model
|
||||
|
||||
|
|
|
@ -3,14 +3,15 @@
|
|||
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
||||
|
||||
__title__ = 'spacy-nightly'
|
||||
__version__ = '2.0.0a13'
|
||||
__version__ = '2.0.0a16'
|
||||
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
|
||||
__uri__ = 'https://spacy.io'
|
||||
__author__ = 'Explosion AI'
|
||||
__email__ = 'contact@explosion.ai'
|
||||
__license__ = 'MIT'
|
||||
__release__ = True
|
||||
|
||||
__docs_models__ = 'https://spacy.io/docs/usage/models'
|
||||
__docs_models__ = 'https://alpha.spacy.io/usage/models'
|
||||
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
|
||||
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
|
||||
__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Reserve 64 values for flag features
|
||||
cpdef enum attr_id_t:
|
||||
cdef enum attr_id_t:
|
||||
NULL_ATTR
|
||||
IS_ALPHA
|
||||
IS_ASCII
|
||||
|
|
|
@ -94,6 +94,7 @@ IDS = {
|
|||
|
||||
# ATTR IDs, in order of the symbol
|
||||
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
|
||||
locals().update(IDS)
|
||||
|
||||
|
||||
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||
|
|
|
@ -4,5 +4,6 @@ from .link import link
|
|||
from .package import package
|
||||
from .profile import profile
|
||||
from .train import train
|
||||
from .evaluate import evaluate
|
||||
from .convert import convert
|
||||
from .model import model
|
||||
|
|
|
@ -14,7 +14,7 @@ from ..util import prints
|
|||
CONVERTERS = {
|
||||
'.conllu': conllu2json,
|
||||
'.conll': conllu2json,
|
||||
'.iob': iob2json
|
||||
'.iob': iob2json,
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
from cytoolz import partition_all, concat
|
||||
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
|
@ -10,11 +11,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
|
|||
"""
|
||||
Convert IOB files into JSON format for use with train cli.
|
||||
"""
|
||||
# TODO: This isn't complete yet -- need to map from IOB to
|
||||
# BILUO
|
||||
with input_path.open('r', encoding='utf8') as file_:
|
||||
docs = read_iob(file_)
|
||||
|
||||
sentences = read_iob(file_)
|
||||
docs = merge_sentences(sentences, n_sents)
|
||||
output_filename = input_path.parts[-1].replace(".iob", ".json")
|
||||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
|
@ -23,9 +22,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
|
|||
title="Generated output file %s" % path2str(output_file))
|
||||
|
||||
|
||||
def read_iob(file_):
|
||||
def read_iob(raw_sents):
|
||||
sentences = []
|
||||
for line in file_:
|
||||
for line in raw_sents:
|
||||
if not line.strip():
|
||||
continue
|
||||
tokens = [t.split('|') for t in line.split()]
|
||||
|
@ -43,3 +42,15 @@ def read_iob(file_):
|
|||
paragraphs = [{'sentences': [sent]} for sent in sentences]
|
||||
docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
|
||||
return docs
|
||||
|
||||
def merge_sentences(docs, n_sents):
    """Merge consecutive docs into groups of up to `n_sents` docs each.

    docs: Iterable of dicts shaped like
        {'id': ..., 'paragraphs': [{'sentences': [...]}]}.
    n_sents: Maximum number of input docs merged into one output doc.
        Must be a positive integer.
    Returns a list holding the first doc of every group, with the
    sentences of the remaining group members appended to its first
    paragraph. The first doc of each group is modified in place.
    """
    docs = list(docs)
    merged = []
    for start in range(0, len(docs), n_sents):
        group = docs[start:start + n_sents]
        first = group[0]
        to_extend = first['paragraphs'][0]['sentences']
        # Every member after the first contributes its sentences. The
        # previous version popped the first doc and then also skipped
        # index 0 of the remainder, dropping one sentence per group.
        for doc in group[1:]:
            to_extend.extend(doc['paragraphs'][0]['sentences'])
        merged.append(first)
    return merged
|
||||
|
|
119
spacy/cli/evaluate.py
Normal file
119
spacy/cli/evaluate.py
Normal file
|
@ -0,0 +1,119 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
import json
|
||||
from collections import defaultdict
|
||||
import cytoolz
|
||||
from pathlib import Path
|
||||
import dill
|
||||
import tqdm
|
||||
from thinc.neural._classes.model import Model
|
||||
from thinc.neural.optimizers import linear_decay
|
||||
from timeit import default_timer as timer
|
||||
import random
|
||||
import numpy.random
|
||||
|
||||
from ..tokens.doc import Doc
|
||||
from ..scorer import Scorer
|
||||
from ..gold import GoldParse, merge_sents
|
||||
from ..gold import GoldCorpus, minibatch
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
from .. import about
|
||||
from .. import displacy
|
||||
from ..compat import json_dumps
|
||||
|
||||
random.seed(0)
|
||||
numpy.random.seed(0)
|
||||
|
||||
|
||||
@plac.annotations(
    model=("Model name or path", "positional", None, str),
    data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
    gpu_id=("Use GPU", "option", "g", int),
    displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
    displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)
)
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
             displacy_path=None, displacy_limit=25):
    """
    Evaluate a model. To render a sample of parses in a HTML file, set an output
    directory as the displacy_path argument.
    """
    util.use_gpu(gpu_id)
    util.set_env_log(False)
    data_path = util.ensure_path(data_path)
    displacy_path = util.ensure_path(displacy_path)
    # Validate both locations up front, before the model is loaded.
    if not data_path.exists():
        prints(data_path, title="Evaluation data not found", exits=1)
    if displacy_path and not displacy_path.exists():
        prints(displacy_path, title="Visualization output directory not found", exits=1)
    # The same path is passed for both GoldCorpus arguments; only the dev
    # docs are read below.
    corpus = GoldCorpus(data_path, data_path)
    nlp = util.load_model(model)
    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))

    # Time only the evaluation call itself.
    started = timer()
    scorer = nlp.evaluate(dev_docs, verbose=False)
    elapsed = timer() - started

    # Each dev item is a (doc, gold) pair; count the tokens of the docs.
    nwords = sum(len(pair[0]) for pair in dev_docs)
    print_results(scorer, time=elapsed, words=nwords, wps=nwords / elapsed)

    if not displacy_path:
        return
    docs, _ = zip(*dev_docs)
    # Render only the styles whose component is listed in the model's meta.
    components = nlp.meta.get('pipeline', [])
    render_parses(docs, displacy_path, model_name=model, limit=displacy_limit,
                  deps='parser' in components, ents='ner' in components)
    prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit)
|
||||
|
||||
|
||||
def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True):
    """Write displaCy HTML pages for the first `limit` docs.

    docs: Non-empty sequence of docs; the first doc's `user_data['title']`
        is set to `model_name`.
    output_path: Directory path object (supports `/` and `.open`).
    model_name: Title stored on the first doc.
    limit: Maximum number of docs passed to the renderer.
    deps: If true, write 'parses.html' using the 'dep' style.
    ents: If true, write 'entities.html' using the 'ent' style.
    """
    docs[0].user_data['title'] = model_name
    sample = docs[:limit]
    # Render before opening each file so a rendering error doesn't leave a
    # truncated file behind. Write as UTF-8 explicitly, so non-ASCII text
    # doesn't depend on the platform's default encoding.
    if ents:
        html = displacy.render(sample, style='ent', page=True)
        with (output_path / 'entities.html').open('w', encoding='utf8') as file_:
            file_.write(html)
    if deps:
        html = displacy.render(sample, style='dep', page=True, options={'compact': True})
        with (output_path / 'parses.html').open('w', encoding='utf8') as file_:
            file_.write(html)
|
||||
|
||||
|
||||
def print_progress(itn, losses, dev_scores, wps=0.0):
    """Print one tab-separated row of training progress.

    itn: Integer iteration number (formatted with '{:d}').
    losses: Dict of losses; the 'parser', 'ner' and 'tagger' keys are
        read, each defaulting to 0.0.
    dev_scores: Dict of evaluation scores; overrides the zero defaults
        and the loss entries on key collisions.
    wps: Words-per-second figure; always overrides any 'wps' in dev_scores.
    """
    zeroed = ('dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
              'ents_p', 'ents_r', 'ents_f', 'wps')
    scores = dict.fromkeys(zeroed, 0.0)
    scores['dep_loss'] = losses.get('parser', 0.0)
    scores['ner_loss'] = losses.get('ner', 0.0)
    scores['tag_loss'] = losses.get('tagger', 0.0)
    scores.update(dev_scores)
    scores['wps'] = wps
    columns = [
        '{:d}',
        '{dep_loss:.3f}',
        '{ner_loss:.3f}',
        '{uas:.3f}',
        '{ents_p:.3f}',
        '{ents_r:.3f}',
        '{ents_f:.3f}',
        '{tags_acc:.3f}',
        '{token_acc:.3f}',
        '{wps:.1f}',
    ]
    print('\t'.join(columns).format(itn, **scores))
|
||||
|
||||
|
||||
def print_results(scorer, time, words, wps):
    """Format evaluation figures and print them as a table titled "Results".

    scorer: Object exposing token_acc, tags_acc, uas, las, ents_p, ents_r
        and ents_f.
    time: Elapsed time, shown with two decimals and an ' s' suffix.
    words: Word count, passed through unformatted.
    wps: Words per second, shown with no decimals.
    """
    two_places = '%.2f'
    results = {
        'Time': '%.2f s' % time,
        'Words': words,
        'Words/s': '%.0f' % wps,
        'TOK': two_places % scorer.token_acc,
        'POS': two_places % scorer.tags_acc,
        'UAS': two_places % scorer.uas,
        'LAS': two_places % scorer.las,
        'NER P': two_places % scorer.ents_p,
        'NER R': two_places % scorer.ents_r,
        'NER F': two_places % scorer.ents_f,
    }
    util.print_table(results, title="Results")
|
|
@ -105,8 +105,11 @@ def generate_pipeline():
|
|||
"parser, ner. For more information, see the docs on processing pipelines.",
|
||||
title="Enter your model's pipeline components")
|
||||
pipeline = util.get_raw_input("Pipeline components", True)
|
||||
replace = {'True': True, 'False': False}
|
||||
return replace[pipeline] if pipeline in replace else pipeline.split(', ')
|
||||
subs = {'True': True, 'False': False}
|
||||
if pipeline in subs:
|
||||
return subs[pipeline]
|
||||
else:
|
||||
return [p.strip() for p in pipeline.split(',')]
|
||||
|
||||
|
||||
def validate_meta(meta, keys):
|
||||
|
|
|
@ -8,8 +8,11 @@ import cytoolz
|
|||
from pathlib import Path
|
||||
import dill
|
||||
import tqdm
|
||||
from thinc.neural._classes.model import Model
|
||||
from thinc.neural.optimizers import linear_decay
|
||||
from timeit import default_timer as timer
|
||||
import random
|
||||
import numpy.random
|
||||
|
||||
from ..tokens.doc import Doc
|
||||
from ..scorer import Scorer
|
||||
|
@ -17,9 +20,13 @@ from ..gold import GoldParse, merge_sents
|
|||
from ..gold import GoldCorpus, minibatch
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
from .. import about
|
||||
from .. import displacy
|
||||
from ..compat import json_dumps
|
||||
|
||||
random.seed(0)
|
||||
numpy.random.seed(0)
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
lang=("model language", "positional", None, str),
|
||||
|
@ -29,15 +36,17 @@ from ..compat import json_dumps
|
|||
n_iter=("number of iterations", "option", "n", int),
|
||||
n_sents=("number of sentences", "option", "ns", int),
|
||||
use_gpu=("Use GPU", "option", "g", int),
|
||||
resume=("Whether to resume training", "flag", "R", bool),
|
||||
vectors=("Model to load vectors from", "option", "v"),
|
||||
no_tagger=("Don't train tagger", "flag", "T", bool),
|
||||
no_parser=("Don't train parser", "flag", "P", bool),
|
||||
no_entities=("Don't train NER", "flag", "N", bool),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
version=("Model version", "option", "V", str),
|
||||
meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
|
||||
)
|
||||
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
||||
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
|
||||
gold_preproc=False):
|
||||
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
|
||||
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
|
||||
gold_preproc=False, version="0.0.0", meta_path=None):
|
||||
"""
|
||||
Train a model. Expects data in spaCy's JSON format.
|
||||
"""
|
||||
|
@ -46,19 +55,24 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
|||
output_path = util.ensure_path(output_dir)
|
||||
train_path = util.ensure_path(train_data)
|
||||
dev_path = util.ensure_path(dev_data)
|
||||
meta_path = util.ensure_path(meta_path)
|
||||
if not output_path.exists():
|
||||
output_path.mkdir()
|
||||
if not train_path.exists():
|
||||
prints(train_path, title="Training data not found", exits=1)
|
||||
if dev_path and not dev_path.exists():
|
||||
prints(dev_path, title="Development data not found", exits=1)
|
||||
if meta_path is not None and not meta_path.exists():
|
||||
prints(meta_path, title="meta.json not found", exits=1)
|
||||
meta = util.read_json(meta_path) if meta_path else {}
|
||||
if not isinstance(meta, dict):
|
||||
prints("Expected dict but got: {}".format(type(meta)),
|
||||
title="Not a valid meta.json format", exits=1)
|
||||
|
||||
lang_class = util.get_lang_class(lang)
|
||||
|
||||
pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
|
||||
if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
|
||||
if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies')
|
||||
if no_entities and 'entities' in pipeline: pipeline.remove('entities')
|
||||
pipeline = ['tagger', 'parser', 'ner']
|
||||
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
|
||||
if no_parser and 'parser' in pipeline: pipeline.remove('parser')
|
||||
if no_entities and 'ner' in pipeline: pipeline.remove('ner')
|
||||
|
||||
# Take dropout and batch size as generators of values -- dropout
|
||||
# starts high and decays sharply, to force the optimizer to explore.
|
||||
|
@ -68,33 +82,30 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
|||
util.env_opt('dropout_to', 0.2),
|
||||
util.env_opt('dropout_decay', 0.0))
|
||||
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
|
||||
util.env_opt('batch_to', 64),
|
||||
util.env_opt('batch_to', 16),
|
||||
util.env_opt('batch_compound', 1.001))
|
||||
|
||||
if resume:
|
||||
prints(output_path / 'model9.pickle', title="Resuming training")
|
||||
nlp = dill.load((output_path / 'model9.pickle').open('rb'))
|
||||
else:
|
||||
nlp = lang_class(pipeline=pipeline)
|
||||
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
||||
n_train_words = corpus.count_train()
|
||||
|
||||
lang_class = util.get_lang_class(lang)
|
||||
nlp = lang_class(pipeline=pipeline)
|
||||
if vectors:
|
||||
util.load_model(vectors, vocab=nlp.vocab)
|
||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||
nlp._optimizer = None
|
||||
|
||||
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
|
||||
print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
|
||||
try:
|
||||
for i in range(n_iter):
|
||||
if resume:
|
||||
i += 20
|
||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
|
||||
gold_preproc=gold_preproc, max_length=0)
|
||||
train_docs = list(train_docs)
|
||||
for i in range(n_iter):
|
||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||
losses = {}
|
||||
for batch in minibatch(train_docs, size=batch_sizes):
|
||||
docs, golds = zip(*batch)
|
||||
nlp.update(docs, golds, sgd=optimizer,
|
||||
drop=next(dropout_rates), losses=losses,
|
||||
update_shared=True)
|
||||
drop=next(dropout_rates), losses=losses)
|
||||
pbar.update(sum(len(doc) for doc in docs))
|
||||
|
||||
with nlp.use_params(optimizer.averages):
|
||||
|
@ -104,12 +115,22 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
|||
nlp_loaded = lang_class(pipeline=pipeline)
|
||||
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
|
||||
scorer = nlp_loaded.evaluate(
|
||||
corpus.dev_docs(
|
||||
list(corpus.dev_docs(
|
||||
nlp_loaded,
|
||||
gold_preproc=gold_preproc))
|
||||
gold_preproc=gold_preproc)))
|
||||
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
|
||||
with acc_loc.open('w') as file_:
|
||||
file_.write(json_dumps(scorer.scores))
|
||||
meta_loc = output_path / ('model%d' % i) / 'meta.json'
|
||||
meta['accuracy'] = scorer.scores
|
||||
meta['lang'] = nlp.lang
|
||||
meta['pipeline'] = pipeline
|
||||
meta['spacy_version'] = '>=%s' % about.__version__
|
||||
meta.setdefault('name', 'model%d' % i)
|
||||
meta.setdefault('version', version)
|
||||
|
||||
with meta_loc.open('w') as file_:
|
||||
file_.write(json_dumps(meta))
|
||||
util.set_env_log(True)
|
||||
print_progress(i, losses, scorer.scores)
|
||||
finally:
|
||||
|
@ -138,12 +159,14 @@ def print_progress(itn, losses, dev_scores, wps=0.0):
|
|||
'ents_p', 'ents_r', 'ents_f', 'wps']:
|
||||
scores[col] = 0.0
|
||||
scores['dep_loss'] = losses.get('parser', 0.0)
|
||||
scores['ner_loss'] = losses.get('ner', 0.0)
|
||||
scores['tag_loss'] = losses.get('tagger', 0.0)
|
||||
scores.update(dev_scores)
|
||||
scores['wps'] = wps
|
||||
tpl = '\t'.join((
|
||||
'{:d}',
|
||||
'{dep_loss:.3f}',
|
||||
'{ner_loss:.3f}',
|
||||
'{uas:.3f}',
|
||||
'{ents_p:.3f}',
|
||||
'{ents_r:.3f}',
|
||||
|
|
|
@ -7,6 +7,7 @@ import re
|
|||
import ujson
|
||||
import random
|
||||
import cytoolz
|
||||
import itertools
|
||||
|
||||
from .syntax import nonproj
|
||||
from .util import ensure_path
|
||||
|
@ -146,9 +147,13 @@ def minibatch(items, size=8):
|
|||
'''Iterate over batches of items. `size` may be an iterator,
|
||||
so that batch-size can vary on each step.
|
||||
'''
|
||||
if isinstance(size, int):
|
||||
size_ = itertools.repeat(8)
|
||||
else:
|
||||
size_ = size
|
||||
items = iter(items)
|
||||
while True:
|
||||
batch_size = next(size) #if hasattr(size, '__next__') else size
|
||||
batch_size = next(size_)
|
||||
batch = list(cytoolz.take(int(batch_size), items))
|
||||
if len(batch) == 0:
|
||||
break
|
||||
|
|
|
@ -29,9 +29,9 @@ _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm
|
|||
'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
|
||||
'TB T G M K %')
|
||||
_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
|
||||
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
|
||||
_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ ·'
|
||||
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
|
||||
_hyphens = '- – — -- ---'
|
||||
_hyphens = '- – — -- --- —— ~'
|
||||
_other_symbols = r'[\p{So}]'
|
||||
|
||||
UNITS = merge_chars(_units)
|
||||
|
|
|
@ -3,6 +3,7 @@ from __future__ import unicode_literals
|
|||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .norm_exceptions import NORM_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lemmatizer import LOOKUP
|
||||
|
@ -23,6 +24,7 @@ class GermanDefaults(Language.Defaults):
|
|||
NORM_EXCEPTIONS, BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
tag_map = dict(TAG_MAP)
|
||||
stop_words = set(STOP_WORDS)
|
||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||
|
|
20
spacy/lang/de/punctuation.py
Normal file
20
spacy/lang/de/punctuation.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
||||
from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||
|
||||
|
||||
# Quote characters usable inside the infix patterns below: QUOTES with every
# plain apostrophe (') removed.
_quotes = QUOTES.replace("'", '')

# German infix patterns: the shared ellipsis and icon lists followed by
# look-behind/look-ahead rules describing where a token may be split.
_infixes = (LIST_ELLIPSES + LIST_ICONS +
            # A period between an ALPHA_LOWER and an ALPHA_UPPER character.
            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
             # A comma, exclamation mark or question mark between ALPHA characters.
             r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
             # One of : < > = preceded by an ALPHA character or a double quote
             # and followed by an ALPHA character.
             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
             # A comma between ALPHA characters (also matched by the [,!?] rule).
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
             # A quote or bracket between ALPHA characters.
             # NOTE(review): the look-ahead is written [\{a}], so a backslash
             # precedes the interpolated class -- confirm this is intended.
             r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
             # A double hyphen between ALPHA characters.
             r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
             # A single hyphen between two ASCII digits.
             r'(?<=[0-9])-(?=[0-9])'])


# Public name imported by the German language defaults.
TOKENIZER_INFIXES = _infixes
|
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
||||
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import LOOKUP
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
|
@ -17,6 +18,7 @@ from ...util import update_exc, add_lookups
|
|||
|
||||
class FrenchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: 'fr'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
|
|
41
spacy/lang/fr/lex_attrs.py
Normal file
41
spacy/lang/fr/lex_attrs.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
_num_words = set("""
|
||||
zero un deux trois quatre cinq six sept huit neuf dix
|
||||
onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
|
||||
vingt trente quanrante cinquante soixante septante quatre-vingt huitante nonante
|
||||
cent mille mil million milliard billion quadrillion quintillion
|
||||
sextillion septillion octillion nonillion decillion
|
||||
""".split())
|
||||
|
||||
_ordinal_words = set("""
|
||||
premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième
|
||||
onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neufième
|
||||
vingtième trentième quanrantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième
|
||||
centième millième millionnième milliardième billionnième quadrillionnième quintillionnième
|
||||
sextillionnième septillionnième octillionnième nonillionnième decillionnième
|
||||
""".split())
|
||||
|
||||
|
||||
def like_num(text):
    """Return True when `text` looks like a number, False otherwise.

    Commas and periods are removed first. The remainder counts as a number
    if it consists only of digits, if it is a fraction with exactly one
    slash and digits on both sides, or if it is listed in `_num_words`.
    """
    # Might require more work?
    # See this discussion: https://github.com/explosion/spaCy/pull/1161
    cleaned = text.replace(',', '').replace('.', '')
    if cleaned.isdigit():
        return True
    # Exactly one '/' yields exactly two pieces.
    pieces = cleaned.split('/')
    if len(pieces) == 2 and pieces[0].isdigit() and pieces[1].isdigit():
        return True
    return cleaned in _num_words
|
||||
|
||||
|
||||
# Lexical attribute getters exported by this module: the LIKE_NUM attribute
# is computed by like_num() above.
LEX_ATTRS = {
    LIKE_NUM: like_num
}
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
|
@ -12,6 +13,7 @@ from ...util import update_exc, add_lookups
|
|||
|
||||
class DutchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: 'nl'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
|
|
40
spacy/lang/nl/lex_attrs.py
Normal file
40
spacy/lang/nl/lex_attrs.py
Normal file
|
@ -0,0 +1,40 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
_num_words = set("""
|
||||
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
|
||||
veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd
|
||||
duizend miljoen miljard biljoen biljard triljoen triljard
|
||||
""".split())
|
||||
|
||||
_ordinal_words = set("""
|
||||
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
|
||||
twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste
|
||||
zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste
|
||||
miljardste biljoenste biljardste triljoenste triljardste
|
||||
""".split())
|
||||
|
||||
|
||||
def like_num(text):
    """Return True when `text` looks like a number, False otherwise.

    Commas and periods are removed first. The remainder counts as a number
    if it consists only of digits, if it is a fraction with exactly one
    slash and digits on both sides, or if it is listed in `_num_words`.
    """
    # This only does the most basic check for whether a token is a digit
    # or matches one of the number words. In order to handle numbers like
    # "drieëntwintig", more work is required.
    # See this discussion: https://github.com/explosion/spaCy/pull/1177
    cleaned = text.replace(',', '').replace('.', '')
    if cleaned.isdigit():
        return True
    # Exactly one '/' yields exactly two pieces.
    pieces = cleaned.split('/')
    if len(pieces) == 2 and pieces[0].isdigit() and pieces[1].isdigit():
        return True
    return cleaned in _num_words
|
||||
|
||||
|
||||
# Lexical attribute getters exported by this module: the LIKE_NUM attribute
# is computed by like_num() above.
LEX_ATTRS = {
    LIKE_NUM: like_num
}
|
35
spacy/lang/th/__init__.py
Normal file
35
spacy/lang/th/__init__.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...tokens import Doc
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
class ThaiDefaults(Language.Defaults):
    """Default settings for Thai, layered on top of Language.Defaults."""
    # Copy the base getters so the assignment below does not modify the
    # dict owned by Language.Defaults.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    # The LANG attribute is the constant 'th' for every input text.
    lex_attr_getters[LANG] = lambda text: 'th'
    # The Thai exceptions are used directly; they are not merged with
    # BASE_EXCEPTIONS here.
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    # Copies of the module-level tag map and stop-word data.
    tag_map = dict(TAG_MAP)
    stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Thai(Language):
    """Thai language class whose tokenization is delegated to PyThaiNLP."""
    lang = 'th'
    Defaults = ThaiDefaults

    def make_doc(self, text):
        """Build a Doc from `text` using PyThaiNLP's word_tokenize.

        text (unicode): The text to segment.
        RETURNS (Doc): A Doc over this object's vocab, one token per
            segmented word, with every token marked as having no
            trailing space.
        RAISES (ImportError): If PyThaiNLP is not installed.
        """
        # Imported lazily so that PyThaiNLP is only required when Thai text
        # is actually processed.
        try:
            from pythainlp.tokenize import word_tokenize
        except ImportError:
            raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
                              "https://github.com/wannaphongcom/pythainlp/")
        # "newmm" is passed as the second positional argument
        # (presumably the segmentation engine name -- confirm against
        # the installed PyThaiNLP version).
        segments = list(word_tokenize(text, "newmm"))
        no_spaces = [False] * len(segments)
        return Doc(self.vocab, words=segments, spaces=no_spaces)
|
||||
|
||||
__all__ = ['Thai']
|
62
spacy/lang/th/stop_words.py
Normal file
62
spacy/lang/th/stop_words.py
Normal file
|
@ -0,0 +1,62 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# data from https://github.com/wannaphongcom/pythainlp/blob/dev/pythainlp/corpus/stopwords-th.txt
|
||||
# stop words as whitespace-separated list
|
||||
STOP_WORDS = set("""
|
||||
นี้ นํา นั้น นัก นอกจาก ทุก ที่สุด ที่ ทําให้ ทํา ทาง ทั้งนี้ ดัง ซึ่ง ช่วง จาก จัด จะ คือ ความ ครั้ง คง ขึ้น ของ
|
||||
ขอ รับ ระหว่าง รวม ยัง มี มาก มา พร้อม พบ ผ่าน ผล บาง น่า เปิดเผย เปิด เนื่องจาก เดียวกัน เดียว เช่น เฉพาะ เข้า ถ้า
|
||||
ถูก ถึง ต้อง ต่างๆ ต่าง ต่อ ตาม ตั้งแต่ ตั้ง ด้าน ด้วย อีก อาจ ออก อย่าง อะไร อยู่ อยาก หาก หลาย หลังจาก แต่ เอง เห็น
|
||||
เลย เริ่ม เรา เมื่อ เพื่อ เพราะ เป็นการ เป็น หลัง หรือ หนึ่ง ส่วน ส่ง สุด สําหรับ ว่า ลง ร่วม ราย ขณะ ก่อน ก็ การ กับ กัน
|
||||
กว่า กล่าว จึง ไว้ ไป ได้ ให้ ใน โดย แห่ง แล้ว และ แรก แบบ ๆ ทั้ง วัน เขา เคย ไม่ อยาก เกิน เกินๆ เกี่ยวกัน เกี่ยวกับ
|
||||
เกี่ยวข้อง เกี่ยวเนื่อง เกี่ยวๆ เกือบ เกือบจะ เกือบๆ แก แก่ แก้ไข ใกล้ ใกล้ๆ ไกล ไกลๆ ขณะเดียวกัน ขณะใด ขณะใดๆ ขณะที่ ขณะนั้น ขณะนี้ ขณะหนึ่ง ขวาง
|
||||
ขวางๆ ขั้น ใคร ใคร่ ใคร่จะ ใครๆ ง่าย ง่ายๆ ไง จง จด จน จนกระทั่ง จนกว่า จนขณะนี้ จนตลอด จนถึง จนทั่ว จนบัดนี้ จนเมื่อ จนแม้ จนแม้น
|
||||
จรด จรดกับ จริง จริงจัง จริงๆ จริงๆจังๆ จวน จวนจะ จวนเจียน จวบ ซึ่งก็ ซึ่งก็คือ ซึ่งกัน ซึ่งกันและกัน ซึ่งได้แก่ ซึ่งๆ ณ ด้วย ด้วยกัน ด้วยเช่นกัน ด้วยที่ ด้วยประการฉะนี้
|
||||
ด้วยเพราะ ด้วยว่า ด้วยเหตุที่ ด้วยเหตุนั้น ด้วยเหตุนี้ ด้วยเหตุเพราะ ด้วยเหตุว่า ด้วยเหมือนกัน ดั่ง ดังกล่าว ดังกับ ดั่งกับ ดังกับว่า ดั่งกับว่า ดังเก่า
|
||||
ดั่งเก่า ดังเคย ใดๆ ได้ ได้แก่ ได้แต่ ได้ที่ ได้มา ได้รับ ตน ตนเอง ตนฯ ตรง ตรงๆ ตลอด ตลอดกาล ตลอดกาลนาน ตลอดจน ตลอดถึง ตลอดทั้ง
|
||||
ตลอดทั่ว ตลอดทั่วถึง ตลอดทั่วทั้ง ตลอดปี ตลอดไป ตลอดมา ตลอดระยะเวลา ตลอดวัน ตลอดเวลา ตลอดศก ต่อ ต่อกัน ถึงแก่ ถึงจะ ถึงบัดนั้น ถึงบัดนี้
|
||||
ถึงเมื่อ ถึงเมื่อใด ถึงเมื่อไร ถึงแม้ ถึงแม้จะ ถึงแม้ว่า ถึงอย่างไร ถือ ถือว่า ถูกต้อง ถูกๆ เถอะ เถิด ทรง ทว่า ทั้งคน ทั้งตัว ทั้งที ทั้งที่ ทั้งนั้น ทั้งนั้นด้วย ทั้งนั้นเพราะ
|
||||
นอก นอกจากที่ นอกจากนั้น นอกจากนี้ นอกจากว่า นอกนั้น นอกเหนือ นอกเหนือจาก น้อย น้อยกว่า น้อยๆ นะ น่ะ นักๆ นั่น นั่นไง นั่นเป็น นั่นแหละ
|
||||
นั่นเอง นั้นๆ นับ นับจากนั้น นับจากนี้ นับตั้งแต่ นับแต่ นับแต่ที่ นับแต่นั้น เป็นต้น เป็นต้นไป เป็นต้นมา เป็นแต่ เป็นแต่เพียง เป็นที เป็นที่ เป็นที่สุด เป็นเพราะ
|
||||
เป็นเพราะว่า เป็นเพียง เป็นเพียงว่า เป็นเพื่อ เป็นอัน เป็นอันมาก เป็นอันว่า เป็นอันๆ เป็นอาทิ เป็นๆ เปลี่ยน เปลี่ยนแปลง เปิด เปิดเผย ไป่ ผ่าน ผ่านๆ
|
||||
ผิด ผิดๆ ผู้ เพียงเพื่อ เพียงไร เพียงไหน เพื่อที่ เพื่อที่จะ เพื่อว่า เพื่อให้ ภาค ภาคฯ ภาย ภายใต้ ภายนอก ภายใน ภายภาค ภายภาคหน้า ภายหน้า ภายหลัง
|
||||
มอง มองว่า มัก มักจะ มัน มันๆ มั้ย มั้ยนะ มั้ยนั่น มั้ยเนี่ย มั้ยล่ะ ยืนนาน ยืนยง ยืนยัน ยืนยาว เยอะ เยอะแยะ เยอะๆ แยะ แยะๆ รวด รวดเร็ว ร่วม รวมกัน ร่วมกัน
|
||||
รวมด้วย ร่วมด้วย รวมถึง รวมทั้ง ร่วมมือ รวมๆ ระยะ ระยะๆ ระหว่าง รับรอง รึ รึว่า รือ รือว่า สิ้นกาลนาน สืบเนื่อง สุดๆ สู่ สูง สูงกว่า สูงส่ง สูงสุด สูงๆ เสมือนกับ
|
||||
เสมือนว่า เสร็จ เสร็จกัน เสร็จแล้ว เสร็จสมบูรณ์ เสร็จสิ้น เสีย เสียก่อน เสียจน เสียจนกระทั่ง เสียจนถึง เสียด้วย เสียนั่น เสียนั่นเอง เสียนี่ เสียนี่กระไร เสียยิ่ง
|
||||
เสียยิ่งนัก เสียแล้ว ใหญ่ๆ ให้ดี ให้แด่ ให้ไป ใหม่ ให้มา ใหม่ๆ ไหน ไหนๆ อดีต อนึ่ง อย่าง อย่างเช่น อย่างดี อย่างเดียว อย่างใด อย่างที่ อย่างน้อย อย่างนั้น
|
||||
อย่างนี้ อย่างโน้น ก็คือ ก็แค่ ก็จะ ก็ดี ก็ได้ ก็ต่อเมื่อ ก็ตาม ก็ตามแต่ ก็ตามที ก็แล้วแต่ กระทั่ง กระทำ กระนั้น กระผม กลับ กล่าวคือ กลุ่ม กลุ่มก้อน
|
||||
กลุ่มๆ กว้าง กว้างขวาง กว้างๆ ก่อนหน้า ก่อนหน้านี้ ก่อนๆ กันดีกว่า กันดีไหม กันเถอะ กันนะ กันและกัน กันไหม กันเอง กำลัง กำลังจะ กำหนด กู เก็บ
|
||||
เกิด เกี่ยวข้อง แก่ แก้ไข ใกล้ ใกล้ๆ ข้า ข้าง ข้างเคียง ข้างต้น ข้างบน ข้างล่าง ข้างๆ ขาด ข้าพเจ้า ข้าฯ เข้าใจ เขียน คงจะ คงอยู่ ครบ ครบครัน ครบถ้วน
|
||||
ครั้งกระนั้น ครั้งก่อน ครั้งครา ครั้งคราว ครั้งใด ครั้งที่ ครั้งนั้น ครั้งนี้ ครั้งละ ครั้งหนึ่ง ครั้งหลัง ครั้งหลังสุด ครั้งไหน ครั้งๆ ครัน ครับ ครา คราใด คราที่ ครานั้น ครานี้ คราหนึ่ง
|
||||
คราไหน คราว คราวก่อน คราวใด คราวที่ คราวนั้น คราวนี้ คราวโน้น คราวละ คราวหน้า คราวหนึ่ง คราวหลัง คราวไหน คราวๆ คล้าย คล้ายกัน คล้ายกันกับ
|
||||
คล้ายกับ คล้ายกับว่า คล้ายว่า ควร ค่อน ค่อนข้าง ค่อนข้างจะ ค่อยไปทาง ค่อนมาทาง ค่อย ค่อยๆ คะ ค่ะ คำ คิด คิดว่า คุณ คุณๆ
|
||||
เคยๆ แค่ แค่จะ แค่นั้น แค่นี้ แค่เพียง แค่ว่า แค่ไหน ใคร่ ใคร่จะ ง่าย ง่ายๆ จนกว่า จนแม้ จนแม้น จังๆ จวบกับ จวบจน จ้ะ จ๊ะ จะได้ จัง จัดการ จัดงาน จัดแจง
|
||||
จัดตั้ง จัดทำ จัดหา จัดให้ จับ จ้า จ๋า จากนั้น จากนี้ จากนี้ไป จำ จำเป็น จำพวก จึงจะ จึงเป็น จู่ๆ ฉะนั้น ฉะนี้ ฉัน เฉกเช่น เฉย เฉยๆ ไฉน ช่วงก่อน
|
||||
ช่วงต่อไป ช่วงถัดไป ช่วงท้าย ช่วงที่ ช่วงนั้น ช่วงนี้ ช่วงระหว่าง ช่วงแรก ช่วงหน้า ช่วงหลัง ช่วงๆ ช่วย ช้า ช้านาน ชาว ช้าๆ เช่นก่อน เช่นกัน เช่นเคย
|
||||
เช่นดัง เช่นดังก่อน เช่นดังเก่า เช่นดังที่ เช่นดังว่า เช่นเดียวกัน เช่นเดียวกับ เช่นใด เช่นที่ เช่นที่เคย เช่นที่ว่า เช่นนั้น เช่นนั้นเอง เช่นนี้ เช่นเมื่อ เช่นไร เชื่อ
|
||||
เชื่อถือ เชื่อมั่น เชื่อว่า ใช่ ใช่ไหม ใช้ ซะ ซะก่อน ซะจน ซะจนกระทั่ง ซะจนถึง ซึ่งได้แก่ ด้วยกัน ด้วยเช่นกัน ด้วยที่ ด้วยเพราะ ด้วยว่า ด้วยเหตุที่ ด้วยเหตุนั้น
|
||||
ด้วยเหตุนี้ ด้วยเหตุเพราะ ด้วยเหตุว่า ด้วยเหมือนกัน ดังกล่าว ดังกับว่า ดั่งกับว่า ดังเก่า ดั่งเก่า ดั่งเคย ต่างก็ ต่างหาก ตามด้วย ตามแต่ ตามที่
|
||||
ตามๆ เต็มไปด้วย เต็มไปหมด เต็มๆ แต่ก็ แต่ก่อน แต่จะ แต่เดิม แต่ต้อง แต่ถ้า แต่ทว่า แต่ที่ แต่นั้น แต่เพียง แต่เมื่อ แต่ไร แต่ละ แต่ว่า แต่ไหน แต่อย่างใด โต
|
||||
โตๆ ใต้ ถ้าจะ ถ้าหาก ถึงแก่ ถึงแม้ ถึงแม้จะ ถึงแม้ว่า ถึงอย่างไร ถือว่า ถูกต้อง ทว่า ทั้งนั้นด้วย ทั้งปวง ทั้งเป็น ทั้งมวล ทั้งสิ้น ทั้งหมด ทั้งหลาย ทั้งๆ ทัน
|
||||
ทันใดนั้น ทันที ทันทีทันใด ทั่ว ทำไม ทำไร ทำให้ ทำๆ ที ที่จริง ที่ซึ่ง ทีเดียว ทีใด ที่ใด ที่ได้ ทีเถอะ ที่แท้ ที่แท้จริง ที่นั้น ที่นี้ ทีไร ทีละ ที่ละ
|
||||
ที่แล้ว ที่ว่า ที่แห่งนั้น ที่ไหน ทีๆ ที่ๆ ทุกคน ทุกครั้ง ทุกครา ทุกคราว ทุกชิ้น ทุกตัว ทุกทาง ทุกที ทุกที่ ทุกเมื่อ ทุกวัน ทุกวันนี้ ทุกสิ่ง ทุกหน ทุกแห่ง ทุกอย่าง
|
||||
ทุกอัน ทุกๆ เท่า เท่ากัน เท่ากับ เท่าใด เท่าที่ เท่านั้น เท่านี้ เท่าไร เท่าไหร่ แท้ แท้จริง เธอ นอกจากว่า น้อย น้อยกว่า น้อยๆ น่ะ นั้นไว นับแต่นี้ นาง
|
||||
นางสาว น่าจะ นาน นานๆ นาย นำ นำพา นำมา นิด นิดหน่อย นิดๆ นี่ นี่ไง นี่นา นี่แน่ะ นี่แหละ นี้แหล่ นี่เอง นี้เอง นู่น นู้น เน้น เนี่ย
|
||||
เนี่ยเอง ในช่วง ในที่ ในเมื่อ ในระหว่าง บน บอก บอกแล้ว บอกว่า บ่อย บ่อยกว่า บ่อยครั้ง บ่อยๆ บัดดล บัดเดี๋ยวนี้ บัดนั้น บัดนี้ บ้าง บางกว่า
|
||||
บางขณะ บางครั้ง บางครา บางคราว บางที บางที่ บางแห่ง บางๆ ปฏิบัติ ประกอบ ประการ ประการฉะนี้ ประการใด ประการหนึ่ง ประมาณ ประสบ ปรับ
|
||||
ปรากฏ ปรากฏว่า ปัจจุบัน ปิด เป็นด้วย เป็นดัง เป็นต้น เป็นแต่ เป็นเพื่อ เป็นอัน เป็นอันมาก เป็นอาทิ ผ่านๆ ผู้ ผู้ใด เผื่อ เผื่อจะ เผื่อที่ เผื่อว่า ฝ่าย
|
||||
ฝ่ายใด พบว่า พยายาม พร้อมกัน พร้อมกับ พร้อมด้วย พร้อมทั้ง พร้อมที่ พร้อมเพียง พวก พวกกัน พวกกู พวกแก พวกเขา พวกคุณ พวกฉัน พวกท่าน
|
||||
พวกที่ พวกเธอ พวกนั้น พวกนี้ พวกนู้น พวกโน้น พวกมัน พวกมึง พอ พอกัน พอควร พอจะ พอดี พอตัว พอที พอที่ พอเพียง พอแล้ว พอสม พอสมควร
|
||||
พอเหมาะ พอๆ พา พึง พึ่ง พื้นๆ พูด เพราะฉะนั้น เพราะว่า เพิ่ง เพิ่งจะ เพิ่ม เพิ่มเติม เพียง เพียงแค่ เพียงใด เพียงแต่ เพียงพอ เพียงเพราะ
|
||||
เพื่อว่า เพื่อให้ ภายใต้ มองว่า มั๊ย มากกว่า มากมาย มิ มิฉะนั้น มิใช่ มิได้ มีแต่ มึง มุ่ง มุ่งเน้น มุ่งหมาย เมื่อก่อน เมื่อครั้ง เมื่อครั้งก่อน
|
||||
เมื่อคราวก่อน เมื่อคราวที่ เมื่อคราว เมื่อคืน เมื่อเช้า เมื่อใด เมื่อนั้น เมื่อนี้ เมื่อเย็น เมื่อไร เมื่อวันวาน เมื่อวาน เมื่อไหร่ แม้ แม้กระทั่ง แม้แต่ แม้นว่า แม้ว่า
|
||||
ไม่ค่อย ไม่ค่อยจะ ไม่ค่อยเป็น ไม่ใช่ ไม่เป็นไร ไม่ว่า ยก ยกให้ ยอม ยอมรับ ย่อม ย่อย ยังคง ยังงั้น ยังงี้ ยังโง้น ยังไง ยังจะ ยังแต่ ยาก
|
||||
ยาว ยาวนาน ยิ่ง ยิ่งกว่า ยิ่งขึ้น ยิ่งขึ้นไป ยิ่งจน ยิ่งจะ ยิ่งนัก ยิ่งเมื่อ ยิ่งแล้ว ยิ่งใหญ่ ร่วมกัน รวมด้วย ร่วมด้วย รือว่า เร็ว เร็วๆ เราๆ เรียก เรียบ เรื่อย
|
||||
เรื่อยๆ ไร ล้วน ล้วนจน ล้วนแต่ ละ ล่าสุด เล็ก เล็กน้อย เล็กๆ เล่าว่า แล้วกัน แล้วแต่ แล้วเสร็จ วันใด วันนั้น วันนี้ วันไหน สบาย สมัย สมัยก่อน
|
||||
สมัยนั้น สมัยนี้ สมัยโน้น ส่วนเกิน ส่วนด้อย ส่วนดี ส่วนใด ส่วนที่ ส่วนน้อย ส่วนนั้น ส่วนมาก ส่วนใหญ่ สั้น สั้นๆ สามารถ สำคัญ สิ่ง
|
||||
สิ่งใด สิ่งนั้น สิ่งนี้ สิ่งไหน สิ้น เสร็จแล้ว เสียด้วย เสียแล้ว แสดง แสดงว่า หน หนอ หนอย หน่อย หมด หมดกัน หมดสิ้น หรือไง หรือเปล่า หรือไม่ หรือยัง
|
||||
หรือไร หากแม้ หากแม้น หากแม้นว่า หากว่า หาความ หาใช่ หารือ เหตุ เหตุผล เหตุนั้น เหตุนี้ เหตุไร เห็นแก่ เห็นควร เห็นจะ เห็นว่า เหลือ เหลือเกิน เหล่า
|
||||
เหล่านั้น เหล่านี้ แห่งใด แห่งนั้น แห่งนี้ แห่งโน้น แห่งไหน แหละ ให้แก่ ใหญ่ ใหญ่โต อย่างเช่น อย่างดี อย่างเดียว อย่างใด อย่างที่ อย่างน้อย อย่างนั้น อย่างนี้
|
||||
อย่างโน้น อย่างมาก อย่างยิ่ง อย่างไร อย่างไรก็ อย่างไรก็ได้ อย่างไรเสีย อย่างละ อย่างหนึ่ง อย่างไหน อย่างๆ อัน อันจะ อันใด อันได้แก่ อันที่
|
||||
อันที่จริง อันที่จะ อันเนื่องมาจาก อันละ อันไหน อันๆ อาจจะ อาจเป็น อาจเป็นด้วย อื่น อื่นๆ เอ็ง เอา ฯ ฯล ฯลฯ
|
||||
""".split())
|
81
spacy/lang/th/tag_map.py
Normal file
81
spacy/lang/th/tag_map.py
Normal file
|
@ -0,0 +1,81 @@
|
|||
# encoding: utf8
|
||||
# data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1)
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import *
|
||||
|
||||
# Maps fine-grained Thai tags (four-letter ORCHID-style codes) and the
# universal tags themselves to universal POS values.
#
# Changes from the previous version of this table:
# * "DCNM" was listed twice (under DET and under NUM). In a dict literal the
#   later entry wins, so the effective mapping was already NUM; the dead DET
#   entry has been removed and the effective value kept.
# * "NPRP" (proper noun) was mapped to PRON, and the pronoun tags "PPRS",
#   "PDMN" and "PNTR" were mapped to PROPN. The two groups were swapped;
#   each tag now maps to the matching universal category.
#   NOTE(review): tag meanings follow the ORCHID tagset -- confirm against
#   the tagset actually used by the training data.
TAG_MAP = {
    # NOUN
    "NOUN": {POS: NOUN},
    "NCMN": {POS: NOUN},
    "NTTL": {POS: NOUN},
    "CNIT": {POS: NOUN},
    "CLTV": {POS: NOUN},
    "CMTR": {POS: NOUN},
    "CFQC": {POS: NOUN},
    "CVBL": {POS: NOUN},
    # PRON
    "PRON": {POS: PRON},
    "PPRS": {POS: PRON},
    "PDMN": {POS: PRON},
    "PNTR": {POS: PRON},
    # ADJ
    "ADJ": {POS: ADJ},
    "NONM": {POS: ADJ},
    "VATT": {POS: ADJ},
    "DONM": {POS: ADJ},
    # ADV
    "ADV": {POS: ADV},
    "ADVN": {POS: ADV},
    "ADVI": {POS: ADV},
    "ADVP": {POS: ADV},
    "ADVS": {POS: ADV},
    # INTJ
    "INT": {POS: INTJ},
    # PROPN
    "PROPN": {POS: PROPN},
    "NPRP": {POS: PROPN},
    # DET
    "DET": {POS: DET},
    "DDAN": {POS: DET},
    "DDAC": {POS: DET},
    "DDBQ": {POS: DET},
    "DDAQ": {POS: DET},
    "DIAC": {POS: DET},
    "DIBQ": {POS: DET},
    "DIAQ": {POS: DET},
    # NUM
    "NUM": {POS: NUM},
    "NCNM": {POS: NUM},
    "NLBL": {POS: NUM},
    "DCNM": {POS: NUM},
    # AUX
    "AUX": {POS: AUX},
    "XVBM": {POS: AUX},
    "XVAM": {POS: AUX},
    "XVMM": {POS: AUX},
    "XVBB": {POS: AUX},
    "XVAE": {POS: AUX},
    # ADP
    "ADP": {POS: ADP},
    "RPRE": {POS: ADP},
    # CCONJ
    "CCONJ": {POS: CCONJ},
    "JCRG": {POS: CCONJ},
    # SCONJ
    "SCONJ": {POS: SCONJ},
    "PREL": {POS: SCONJ},
    "JSBR": {POS: SCONJ},
    "JCMP": {POS: SCONJ},
    # PART
    "PART": {POS: PART},
    "FIXN": {POS: PART},
    "FIXV": {POS: PART},
    "EAFF": {POS: PART},
    "AITT": {POS: PART},
    "NEG": {POS: PART},
    # PUNCT
    "PUNCT": {POS: PUNCT},
    "PUNC": {POS: PUNCT}
}
|
43
spacy/lang/th/tokenizer_exceptions.py
Normal file
43
spacy/lang/th/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import *
|
||||
|
||||
# Abbreviated Thai month names paired with the full month name that is
# stored as the lemma of the abbreviation.
_MONTH_ABBREVIATIONS = (
    ("ม.ค.", "มกราคม"),
    ("ก.พ.", "กุมภาพันธ์"),
    ("มี.ค.", "มีนาคม"),
    ("เม.ย.", "เมษายน"),
    ("พ.ค.", "พฤษภาคม"),
    ("มิ.ย.", "มิถุนายน"),
    ("ก.ค.", "กรกฎาคม"),
    ("ส.ค.", "สิงหาคม"),
    ("ก.ย.", "กันยายน"),
    ("ต.ค.", "ตุลาคม"),
    ("พ.ย.", "พฤศจิกายน"),
    ("ธ.ค.", "ธันวาคม"),
)

# Each abbreviation maps to a single-token analysis: the token text (ORTH)
# is the abbreviation itself and its LEMMA is the full month name.
TOKENIZER_EXCEPTIONS = {
    abbreviation: [{ORTH: abbreviation, LEMMA: month_name}]
    for abbreviation, month_name in _MONTH_ABBREVIATIONS
}
|
|
@ -14,7 +14,7 @@ class Chinese(Language):
|
|||
except ImportError:
|
||||
raise ImportError("The Chinese tokenizer requires the Jieba library: "
|
||||
"https://github.com/fxsjy/jieba")
|
||||
words = list(jieba.cut(text, cut_all=True))
|
||||
words = list(jieba.cut(text, cut_all=False))
|
||||
words = [x for x in words if x]
|
||||
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||
|
||||
|
|
|
@ -34,6 +34,7 @@ from .lang.tag_map import TAG_MAP
|
|||
from .lang.lex_attrs import LEX_ATTRS
|
||||
from . import util
|
||||
from .scorer import Scorer
|
||||
from ._ml import link_vectors_to_models
|
||||
|
||||
|
||||
class BaseDefaults(object):
|
||||
|
@ -278,8 +279,7 @@ class Language(object):
|
|||
def make_doc(self, text):
|
||||
return self.tokenizer(text)
|
||||
|
||||
def update(self, docs, golds, drop=0., sgd=None, losses=None,
|
||||
update_shared=False):
|
||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||
"""Update the models in the pipeline.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
|
@ -303,32 +303,17 @@ class Language(object):
|
|||
if self._optimizer is None:
|
||||
self._optimizer = Adam(Model.ops, 0.001)
|
||||
sgd = self._optimizer
|
||||
tok2vec = self.pipeline[0]
|
||||
feats = tok2vec.doc2feats(docs)
|
||||
grads = {}
|
||||
def get_grads(W, dW, key=None):
|
||||
grads[key] = (W, dW)
|
||||
pipes = list(self.pipeline[1:])
|
||||
pipes = list(self.pipeline)
|
||||
random.shuffle(pipes)
|
||||
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
|
||||
all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
|
||||
for proc in pipes:
|
||||
if not hasattr(proc, 'update'):
|
||||
continue
|
||||
d_tokvecses = proc.update((docs, tokvecses), golds,
|
||||
drop=drop, sgd=get_grads, losses=losses)
|
||||
if update_shared and d_tokvecses is not None:
|
||||
for i, d_tv in enumerate(d_tokvecses):
|
||||
all_d_tokvecses[i] += d_tv
|
||||
if update_shared and bp_tokvecses is not None:
|
||||
bp_tokvecses(all_d_tokvecses, sgd=sgd)
|
||||
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
|
||||
for key, (W, dW) in grads.items():
|
||||
sgd(W, dW, key=key)
|
||||
# Clear the tensor variable, to free GPU memory.
|
||||
# If we don't do this, the memory leak gets pretty
|
||||
# bad, because we may be holding part of a batch.
|
||||
for doc in docs:
|
||||
doc.tensor = None
|
||||
|
||||
def preprocess_gold(self, docs_golds):
|
||||
"""Can be called before training to pre-process gold data. By default,
|
||||
|
@ -343,36 +328,49 @@ class Language(object):
|
|||
for doc, gold in docs_golds:
|
||||
yield doc, gold
|
||||
|
||||
def begin_training(self, get_gold_tuples, **cfg):
|
||||
def resume_training(self, **cfg):
|
||||
if cfg.get('device', -1) >= 0:
|
||||
device = util.use_gpu(cfg['device'])
|
||||
if self.vocab.vectors.data.shape[1] >= 1:
|
||||
self.vocab.vectors.data = Model.ops.asarray(
|
||||
self.vocab.vectors.data)
|
||||
else:
|
||||
device = None
|
||||
learn_rate = util.env_opt('learn_rate', 0.001)
|
||||
beta1 = util.env_opt('optimizer_B1', 0.9)
|
||||
beta2 = util.env_opt('optimizer_B2', 0.999)
|
||||
eps = util.env_opt('optimizer_eps', 1e-08)
|
||||
L2 = util.env_opt('L2_penalty', 1e-6)
|
||||
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
||||
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
||||
beta2=beta2, eps=eps)
|
||||
self._optimizer.max_grad_norm = max_grad_norm
|
||||
self._optimizer.device = device
|
||||
return self._optimizer
|
||||
|
||||
def begin_training(self, get_gold_tuples=None, **cfg):
|
||||
"""Allocate models, pre-process training data and acquire a trainer and
|
||||
optimizer. Used as a contextmanager.
|
||||
|
||||
gold_tuples (iterable): Gold-standard training data.
|
||||
get_gold_tuples (function): Function returning gold data
|
||||
**cfg: Config parameters.
|
||||
YIELDS (tuple): A trainer and an optimizer.
|
||||
|
||||
EXAMPLE:
|
||||
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||
>>> for epoch in trainer.epochs(gold):
|
||||
>>> for docs, golds in epoch:
|
||||
>>> state = nlp.update(docs, golds, sgd=optimizer)
|
||||
returns: An optimizer
|
||||
"""
|
||||
if self.parser:
|
||||
self.pipeline.append(NeuralLabeller(self.vocab))
|
||||
# Populate vocab
|
||||
if get_gold_tuples is not None:
|
||||
for _, annots_brackets in get_gold_tuples():
|
||||
for annots, _ in annots_brackets:
|
||||
for word in annots[1]:
|
||||
_ = self.vocab[word]
|
||||
contexts = []
|
||||
if cfg.get('device', -1) >= 0:
|
||||
import cupy.cuda.device
|
||||
device = cupy.cuda.device.Device(cfg['device'])
|
||||
device.use()
|
||||
Model.ops = CupyOps()
|
||||
Model.Ops = CupyOps
|
||||
device = util.use_gpu(cfg['device'])
|
||||
if self.vocab.vectors.data.shape[1] >= 1:
|
||||
self.vocab.vectors.data = Model.ops.asarray(
|
||||
self.vocab.vectors.data)
|
||||
else:
|
||||
device = None
|
||||
link_vectors_to_models(self.vocab)
|
||||
for proc in self.pipeline:
|
||||
if hasattr(proc, 'begin_training'):
|
||||
context = proc.begin_training(get_gold_tuples(),
|
||||
|
@ -390,7 +388,7 @@ class Language(object):
|
|||
self._optimizer.device = device
|
||||
return self._optimizer
|
||||
|
||||
def evaluate(self, docs_golds):
|
||||
def evaluate(self, docs_golds, verbose=False):
|
||||
scorer = Scorer()
|
||||
docs, golds = zip(*docs_golds)
|
||||
docs = list(docs)
|
||||
|
@ -403,8 +401,9 @@ class Language(object):
|
|||
docs = list(pipe.pipe(docs))
|
||||
assert len(docs) == len(golds)
|
||||
for doc, gold in zip(docs, golds):
|
||||
scorer.score(doc, gold)
|
||||
doc.tensor = None
|
||||
if verbose:
|
||||
print(doc)
|
||||
scorer.score(doc, gold, verbose=verbose)
|
||||
return scorer
|
||||
|
||||
@contextmanager
|
||||
|
@ -493,7 +492,6 @@ class Language(object):
|
|||
"""
|
||||
path = util.ensure_path(path)
|
||||
serializers = OrderedDict((
|
||||
('vocab', lambda p: self.vocab.to_disk(p)),
|
||||
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
|
||||
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
|
||||
))
|
||||
|
@ -505,6 +503,7 @@ class Language(object):
|
|||
if not hasattr(proc, 'to_disk'):
|
||||
continue
|
||||
serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
|
||||
serializers['vocab'] = lambda p: self.vocab.to_disk(p)
|
||||
util.to_disk(path, serializers, {p: False for p in disable})
|
||||
|
||||
def from_disk(self, path, disable=tuple()):
|
||||
|
|
|
@ -38,7 +38,8 @@ class Lemmatizer(object):
|
|||
avoid lemmatization entirely.
|
||||
"""
|
||||
morphology = {} if morphology is None else morphology
|
||||
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
|
||||
others = [key for key in morphology
|
||||
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
|
||||
true_morph_key = morphology.get('morph', 0)
|
||||
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
|
||||
return True
|
||||
|
@ -47,7 +48,9 @@ class Lemmatizer(object):
|
|||
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
||||
# morphology
|
||||
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
|
||||
morphology.get('Tense') == 'pres'):
|
||||
morphology.get('Tense') == 'pres' and \
|
||||
morphology.get('Number') is None and \
|
||||
not others):
|
||||
return True
|
||||
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
|
||||
return True
|
||||
|
|
|
@ -421,47 +421,69 @@ cdef class PhraseMatcher:
|
|||
cdef int max_length
|
||||
cdef attr_t* _phrase_key
|
||||
|
||||
def __init__(self, Vocab vocab, phrases, max_length=10):
|
||||
cdef public object _callbacks
|
||||
cdef public object _patterns
|
||||
|
||||
def __init__(self, Vocab vocab, max_length=10):
|
||||
self.mem = Pool()
|
||||
self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
|
||||
self.max_length = max_length
|
||||
self.vocab = vocab
|
||||
self.matcher = Matcher(self.vocab, {})
|
||||
self.matcher = Matcher(self.vocab)
|
||||
self.phrase_ids = PreshMap()
|
||||
for phrase in phrases:
|
||||
if len(phrase) < max_length:
|
||||
self.add(phrase)
|
||||
|
||||
abstract_patterns = []
|
||||
for length in range(1, max_length):
|
||||
abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
|
||||
self.matcher.add('Candidate', 'MWE', {}, abstract_patterns, acceptor=self.accept_match)
|
||||
self.matcher.add('Candidate', None, *abstract_patterns)
|
||||
self._callbacks = {}
|
||||
|
||||
def add(self, Doc tokens):
|
||||
cdef int length = tokens.length
|
||||
assert length < self.max_length
|
||||
tags = get_bilou(length)
|
||||
assert len(tags) == length, length
|
||||
def __len__(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def __contains__(self, key):
|
||||
raise NotImplementedError
|
||||
|
||||
def __reduce__(self):
|
||||
return (self.__class__, (self.vocab,), None, None)
|
||||
|
||||
def add(self, key, on_match, *docs):
|
||||
cdef Doc doc
|
||||
for doc in docs:
|
||||
if len(doc) >= self.max_length:
|
||||
msg = (
|
||||
"Pattern length (%d) >= phrase_matcher.max_length (%d). "
|
||||
"Length can be set on initialization, up to 10."
|
||||
)
|
||||
raise ValueError(msg % (len(doc), self.max_length))
|
||||
cdef hash_t ent_id = self.matcher._normalize_key(key)
|
||||
self._callbacks[ent_id] = on_match
|
||||
|
||||
cdef int length
|
||||
cdef int i
|
||||
cdef hash_t phrase_hash
|
||||
for doc in docs:
|
||||
length = doc.length
|
||||
tags = get_bilou(length)
|
||||
for i in range(self.max_length):
|
||||
self._phrase_key[i] = 0
|
||||
for i, tag in enumerate(tags):
|
||||
lexeme = self.vocab[tokens.c[i].lex.orth]
|
||||
lexeme = self.vocab[doc.c[i].lex.orth]
|
||||
lexeme.set_flag(tag, True)
|
||||
self._phrase_key[i] = lexeme.orth
|
||||
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
|
||||
self.phrase_ids[key] = True
|
||||
phrase_hash = hash64(self._phrase_key,
|
||||
self.max_length * sizeof(attr_t), 0)
|
||||
self.phrase_ids.set(phrase_hash, <void*>ent_id)
|
||||
|
||||
def __call__(self, Doc doc):
|
||||
matches = []
|
||||
for ent_id, label, start, end in self.matcher(doc):
|
||||
cand = doc[start : end]
|
||||
start = cand[0].idx
|
||||
end = cand[-1].idx + len(cand[-1])
|
||||
matches.append((start, end, cand.root.tag_, cand.text, 'MWE'))
|
||||
for match in matches:
|
||||
doc.merge(*match)
|
||||
for _, start, end in self.matcher(doc):
|
||||
ent_id = self.accept_match(doc, start, end)
|
||||
if ent_id is not None:
|
||||
matches.append((ent_id, start, end))
|
||||
for i, (ent_id, start, end) in enumerate(matches):
|
||||
on_match = self._callbacks.get(ent_id)
|
||||
if on_match is not None:
|
||||
on_match(self, doc, i, matches)
|
||||
return matches
|
||||
|
||||
def pipe(self, stream, batch_size=1000, n_threads=2):
|
||||
|
@ -469,7 +491,7 @@ cdef class PhraseMatcher:
|
|||
self(doc)
|
||||
yield doc
|
||||
|
||||
def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end):
|
||||
def accept_match(self, Doc doc, int start, int end):
|
||||
assert (end - start) < self.max_length
|
||||
cdef int i, j
|
||||
for i in range(self.max_length):
|
||||
|
@ -477,7 +499,8 @@ cdef class PhraseMatcher:
|
|||
for i, j in enumerate(range(start, end)):
|
||||
self._phrase_key[i] = doc.c[j].lex.orth
|
||||
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
|
||||
if self.phrase_ids.get(key):
|
||||
return (ent_id, label, start, end)
|
||||
ent_id = <hash_t>self.phrase_ids.get(key)
|
||||
if ent_id == 0:
|
||||
return None
|
||||
else:
|
||||
return False
|
||||
return ent_id
|
||||
|
|
|
@ -146,6 +146,8 @@ cdef class Morphology:
|
|||
self.add_special_case(tag_str, form_str, attrs)
|
||||
|
||||
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
||||
if orth not in self.strings:
|
||||
return orth
|
||||
cdef unicode py_string = self.strings[orth]
|
||||
if self.lemmatizer is None:
|
||||
return self.strings.add(py_string.lower())
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from thinc.api import chain, layerize, with_getitem
|
||||
from thinc.neural import Model, Softmax
|
||||
import numpy
|
||||
cimport numpy as np
|
||||
import cytoolz
|
||||
|
@ -14,17 +13,18 @@ import ujson
|
|||
import msgpack
|
||||
|
||||
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
||||
from thinc.neural import Model, Maxout, Softmax, Affine
|
||||
from thinc.neural._classes.hash_embed import HashEmbed
|
||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
|
||||
from thinc.i2v import HashEmbed
|
||||
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
|
||||
from thinc.t2t import ExtractWindow, ParametricAttention
|
||||
from thinc.misc import Residual
|
||||
from thinc.misc import BatchNorm as BN
|
||||
from thinc.misc import LayerNorm as LN
|
||||
|
||||
from thinc.neural.util import to_categorical
|
||||
|
||||
from thinc.neural.pooling import Pooling, max_pool, mean_pool
|
||||
from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
||||
|
||||
from thinc.neural._classes.convolution import ExtractWindow
|
||||
from thinc.neural._classes.resnet import Residual
|
||||
from thinc.neural._classes.batchnorm import BatchNorm as BN
|
||||
|
||||
from .tokens.doc cimport Doc
|
||||
from .syntax.parser cimport Parser as LinearParser
|
||||
from .syntax.nn_parser cimport Parser as NeuralParser
|
||||
|
@ -41,13 +41,14 @@ from .syntax import nonproj
|
|||
from .compat import json_dumps
|
||||
|
||||
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
|
||||
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
|
||||
from ._ml import rebatch, Tok2Vec, flatten
|
||||
from ._ml import build_text_classifier, build_tagger_model
|
||||
from ._ml import link_vectors_to_models
|
||||
from .parts_of_speech import X
|
||||
|
||||
|
||||
class SentenceSegmenter(object):
|
||||
'''A simple spaCy hook, to allow custom sentence boundary detection logic
|
||||
"""A simple spaCy hook, to allow custom sentence boundary detection logic
|
||||
(that doesn't require the dependency parse).
|
||||
|
||||
To change the sentence boundary detection strategy, pass a generator
|
||||
|
@ -56,7 +57,7 @@ class SentenceSegmenter(object):
|
|||
|
||||
Sentence detection strategies should be generators that take `Doc` objects
|
||||
and yield `Span` objects for each sentence.
|
||||
'''
|
||||
"""
|
||||
name = 'sbd'
|
||||
|
||||
def __init__(self, vocab, strategy=None):
|
||||
|
@ -88,17 +89,30 @@ class BaseThincComponent(object):
|
|||
|
||||
@classmethod
|
||||
def Model(cls, *shape, **kwargs):
    """Initialize a model for the pipe.

    cls: The class the method is invoked on.
    *shape: Positional arguments; not used by this base implementation.
    **kwargs: Keyword arguments; not used by this base implementation.

    This base implementation does nothing except raise
    NotImplementedError.
    """
    raise NotImplementedError
|
||||
|
||||
def __init__(self, vocab, model=True, **cfg):
|
||||
"""Create a new pipe instance."""
|
||||
raise NotImplementedError
|
||||
|
||||
def __call__(self, doc):
|
||||
"""Apply the pipe to one document. The document is
|
||||
modified in-place, and returned.
|
||||
|
||||
Both __call__ and pipe should delegate to the `predict()`
|
||||
and `set_annotations()` methods.
|
||||
"""
|
||||
scores = self.predict([doc])
|
||||
self.set_annotations([doc], scores)
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||
"""Apply the pipe to a stream of documents.
|
||||
|
||||
Both __call__ and pipe should delegate to the `predict()`
|
||||
and `set_annotations()` methods.
|
||||
"""
|
||||
for docs in cytoolz.partition_all(batch_size, stream):
|
||||
docs = list(docs)
|
||||
scores = self.predict(docs)
|
||||
|
@ -106,27 +120,43 @@ class BaseThincComponent(object):
|
|||
yield from docs
|
||||
|
||||
def predict(self, docs):
    """Apply the pipeline's model to a batch of docs, without
    modifying them.

    docs: The batch of documents to score.

    This base implementation does nothing except raise
    NotImplementedError.
    """
    raise NotImplementedError
|
||||
|
||||
def set_annotations(self, docs, scores):
    """Modify a batch of documents, using pre-computed scores.

    docs: The batch of documents to annotate.
    scores: The pre-computed scores for that batch.

    This base implementation does nothing except raise
    NotImplementedError.
    """
    raise NotImplementedError
|
||||
|
||||
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
|
||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||
"""Learn from a batch of documents and gold-standard information,
|
||||
updating the pipe's model.
|
||||
|
||||
Delegates to predict() and get_loss().
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def get_loss(self, docs, golds, scores):
    """Find the loss and gradient of loss for the batch of
    documents and their predicted scores.

    docs: The batch of documents.
    golds: The gold-standard information for the batch.
    scores: The scores predicted for the batch.

    This base implementation does nothing except raise
    NotImplementedError.
    """
    raise NotImplementedError
|
||||
|
||||
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
||||
token_vector_width = pipeline[0].model.nO
|
||||
"""Initialize the pipe for training, using data exampes if available.
|
||||
If no model has been initialized yet, the model is added."""
|
||||
if self.model is True:
|
||||
self.model = self.Model(1, token_vector_width)
|
||||
self.model = self.Model(**self.cfg)
|
||||
link_vectors_to_models(self.vocab)
|
||||
|
||||
def use_params(self, params):
    """Modify the pipe's model, to use the given parameter values.

    params: Passed unchanged to self.model.use_params().

    The body yields exactly once while the context returned by
    self.model.use_params(params) is active, and leaves that context
    when the generator is resumed.
    """
    # NOTE(review): no @contextmanager decorator is visible above this
    # def, so as written a call returns a plain generator -- confirm how
    # callers are expected to drive it.
    with self.model.use_params(params):
        yield
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
"""Serialize the pipe to a bytestring."""
|
||||
serialize = OrderedDict((
|
||||
('cfg', lambda: json_dumps(self.cfg)),
|
||||
('model', lambda: self.model.to_bytes()),
|
||||
|
@ -135,37 +165,42 @@ class BaseThincComponent(object):
|
|||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
"""Load the pipe from a bytestring."""
|
||||
def load_model(b):
|
||||
if self.model is True:
|
||||
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||
self.model = self.Model(**self.cfg)
|
||||
self.model.from_bytes(b)
|
||||
|
||||
deserialize = OrderedDict((
|
||||
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||
('model', load_model),
|
||||
('vocab', lambda b: self.vocab.from_bytes(b))
|
||||
))
|
||||
util.from_bytes(bytes_data, deserialize, exclude)
|
||||
return self
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
"""Serialize the pipe to disk."""
|
||||
serialize = OrderedDict((
|
||||
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
|
||||
('vocab', lambda p: self.vocab.to_disk(p)),
|
||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
||||
('vocab', lambda p: self.vocab.to_disk(p))
|
||||
))
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
"""Load the pipe from disk."""
|
||||
def load_model(p):
|
||||
if self.model is True:
|
||||
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||
self.model = self.Model(**self.cfg)
|
||||
self.model.from_bytes(p.open('rb').read())
|
||||
|
||||
deserialize = OrderedDict((
|
||||
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
|
||||
('model', load_model),
|
||||
('vocab', lambda p: self.vocab.from_disk(p)),
|
||||
('model', load_model),
|
||||
))
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
return self
|
||||
|
@ -193,7 +228,7 @@ class TokenVectorEncoder(BaseThincComponent):
|
|||
"""
|
||||
width = util.env_opt('token_vector_width', width)
|
||||
embed_size = util.env_opt('embed_size', embed_size)
|
||||
return Tok2Vec(width, embed_size, preprocess=None)
|
||||
return Tok2Vec(width, embed_size, **cfg)
|
||||
|
||||
def __init__(self, vocab, model=True, **cfg):
|
||||
"""Construct a new statistical model. Weights are not allocated on
|
||||
|
@ -210,9 +245,10 @@ class TokenVectorEncoder(BaseThincComponent):
|
|||
>>> tok2vec.model = tok2vec.Model(128, 5000)
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.doc2feats = doc2feats()
|
||||
self.model = model
|
||||
self.cfg = dict(cfg)
|
||||
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
||||
self.cfg.setdefault('cnn_maxout_pieces', 3)
|
||||
|
||||
def __call__(self, doc):
|
||||
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
||||
|
@ -245,8 +281,7 @@ class TokenVectorEncoder(BaseThincComponent):
|
|||
docs (iterable): A sequence of `Doc` objects.
|
||||
RETURNS (object): Vector representations for each token in the documents.
|
||||
"""
|
||||
feats = self.doc2feats(docs)
|
||||
tokvecs = self.model(feats)
|
||||
tokvecs = self.model(docs)
|
||||
return tokvecs
|
||||
|
||||
def set_annotations(self, docs, tokvecses):
|
||||
|
@ -270,8 +305,7 @@ class TokenVectorEncoder(BaseThincComponent):
|
|||
"""
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
feats = self.doc2feats(docs)
|
||||
tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop)
|
||||
tokvecs, bp_tokvecs = self.model.begin_update(docs, drop=drop)
|
||||
return tokvecs, bp_tokvecs
|
||||
|
||||
def get_loss(self, docs, golds, scores):
|
||||
|
@ -285,9 +319,10 @@ class TokenVectorEncoder(BaseThincComponent):
|
|||
gold_tuples (iterable): Gold-standard training data.
|
||||
pipeline (list): The pipeline the model is part of.
|
||||
"""
|
||||
self.doc2feats = doc2feats()
|
||||
if self.model is True:
|
||||
self.model = self.Model()
|
||||
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||
self.model = self.Model(**self.cfg)
|
||||
link_vectors_to_models(self.vocab)
|
||||
|
||||
|
||||
class NeuralTagger(BaseThincComponent):
|
||||
|
@ -296,29 +331,29 @@ class NeuralTagger(BaseThincComponent):
|
|||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.cfg = dict(cfg)
|
||||
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
||||
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
|
||||
|
||||
def __call__(self, doc):
|
||||
tags = self.predict(([doc], [doc.tensor]))
|
||||
tags = self.predict([doc])
|
||||
self.set_annotations([doc], tags)
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||
for docs in cytoolz.partition_all(batch_size, stream):
|
||||
docs = list(docs)
|
||||
tokvecs = [d.tensor for d in docs]
|
||||
tag_ids = self.predict((docs, tokvecs))
|
||||
tag_ids = self.predict(docs)
|
||||
self.set_annotations(docs, tag_ids)
|
||||
yield from docs
|
||||
|
||||
def predict(self, docs_tokvecs):
|
||||
scores = self.model(docs_tokvecs)
|
||||
def predict(self, docs):
|
||||
scores = self.model(docs)
|
||||
scores = self.model.ops.flatten(scores)
|
||||
guesses = scores.argmax(axis=1)
|
||||
if not isinstance(guesses, numpy.ndarray):
|
||||
guesses = guesses.get()
|
||||
tokvecs = docs_tokvecs[1]
|
||||
guesses = self.model.ops.unflatten(guesses,
|
||||
[tv.shape[0] for tv in tokvecs])
|
||||
[len(d) for d in docs])
|
||||
return guesses
|
||||
|
||||
def set_annotations(self, docs, batch_tag_ids):
|
||||
|
@ -338,20 +373,16 @@ class NeuralTagger(BaseThincComponent):
|
|||
idx += 1
|
||||
doc.is_tagged = True
|
||||
|
||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||
if losses is not None and self.name not in losses:
|
||||
losses[self.name] = 0.
|
||||
docs, tokvecs = docs_tokvecs
|
||||
|
||||
if self.model.nI is None:
|
||||
self.model.nI = tokvecs[0].shape[1]
|
||||
tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
|
||||
tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
|
||||
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
||||
bp_tag_scores(d_tag_scores, sgd=sgd)
|
||||
|
||||
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
||||
if losses is not None:
|
||||
losses[self.name] += loss
|
||||
return d_tokvecs
|
||||
|
||||
def get_loss(self, docs, golds, scores):
|
||||
scores = self.model.ops.flatten(scores)
|
||||
|
@ -392,13 +423,14 @@ class NeuralTagger(BaseThincComponent):
|
|||
vocab.morphology = Morphology(vocab.strings, new_tag_map,
|
||||
vocab.morphology.lemmatizer,
|
||||
exc=vocab.morphology.exc)
|
||||
token_vector_width = pipeline[0].model.nO
|
||||
if self.model is True:
|
||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
||||
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
||||
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
||||
link_vectors_to_models(self.vocab)
|
||||
|
||||
@classmethod
|
||||
def Model(cls, n_tags, token_vector_width):
|
||||
return build_tagger_model(n_tags, token_vector_width)
|
||||
def Model(cls, n_tags, **cfg):
|
||||
return build_tagger_model(n_tags, **cfg)
|
||||
|
||||
def use_params(self, params):
|
||||
with self.model.use_params(params):
|
||||
|
@ -419,7 +451,7 @@ class NeuralTagger(BaseThincComponent):
|
|||
if self.model is True:
|
||||
token_vector_width = util.env_opt('token_vector_width',
|
||||
self.cfg.get('token_vector_width', 128))
|
||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
||||
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
||||
self.model.from_bytes(b)
|
||||
|
||||
def load_tag_map(b):
|
||||
|
@ -438,6 +470,7 @@ class NeuralTagger(BaseThincComponent):
|
|||
return self
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
||||
serialize = OrderedDict((
|
||||
('vocab', lambda p: self.vocab.to_disk(p)),
|
||||
('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
|
||||
|
@ -452,9 +485,7 @@ class NeuralTagger(BaseThincComponent):
|
|||
def from_disk(self, path, **exclude):
|
||||
def load_model(p):
|
||||
if self.model is True:
|
||||
token_vector_width = util.env_opt('token_vector_width',
|
||||
self.cfg.get('token_vector_width', 128))
|
||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
||||
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
||||
self.model.from_bytes(p.open('rb').read())
|
||||
|
||||
def load_tag_map(p):
|
||||
|
@ -466,10 +497,10 @@ class NeuralTagger(BaseThincComponent):
|
|||
exc=self.vocab.morphology.exc)
|
||||
|
||||
deserialize = OrderedDict((
|
||||
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
|
||||
('vocab', lambda p: self.vocab.from_disk(p)),
|
||||
('tag_map', load_tag_map),
|
||||
('model', load_model),
|
||||
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
|
||||
))
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
return self
|
||||
|
@ -477,10 +508,28 @@ class NeuralTagger(BaseThincComponent):
|
|||
|
||||
class NeuralLabeller(NeuralTagger):
|
||||
name = 'nn_labeller'
|
||||
def __init__(self, vocab, model=True, **cfg):
|
||||
def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
if target == 'dep':
|
||||
self.make_label = self.make_dep
|
||||
elif target == 'tag':
|
||||
self.make_label = self.make_tag
|
||||
elif target == 'ent':
|
||||
self.make_label = self.make_ent
|
||||
elif target == 'dep_tag_offset':
|
||||
self.make_label = self.make_dep_tag_offset
|
||||
elif target == 'ent_tag':
|
||||
self.make_label = self.make_ent_tag
|
||||
elif hasattr(target, '__call__'):
|
||||
self.make_label = target
|
||||
else:
|
||||
raise ValueError(
|
||||
"NeuralLabeller target should be function or one of "
|
||||
"['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
|
||||
self.cfg = dict(cfg)
|
||||
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
||||
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
@ -493,41 +542,79 @@ class NeuralLabeller(NeuralTagger):
|
|||
def set_annotations(self, docs, dep_ids):
|
||||
pass
|
||||
|
||||
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
||||
def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):
|
||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
||||
for raw_text, annots_brackets in gold_tuples:
|
||||
for annots, brackets in annots_brackets:
|
||||
ids, words, tags, heads, deps, ents = annots
|
||||
for dep in deps:
|
||||
if dep not in self.labels:
|
||||
self.labels[dep] = len(self.labels)
|
||||
token_vector_width = pipeline[0].model.nO
|
||||
for i in range(len(ids)):
|
||||
label = self.make_label(i, words, tags, heads, deps, ents)
|
||||
if label is not None and label not in self.labels:
|
||||
self.labels[label] = len(self.labels)
|
||||
print(len(self.labels))
|
||||
if self.model is True:
|
||||
self.model = self.Model(len(self.labels), token_vector_width)
|
||||
token_vector_width = util.env_opt('token_vector_width')
|
||||
self.model = chain(
|
||||
tok2vec,
|
||||
Softmax(len(self.labels), token_vector_width)
|
||||
)
|
||||
link_vectors_to_models(self.vocab)
|
||||
|
||||
@classmethod
|
||||
def Model(cls, n_tags, token_vector_width):
|
||||
return build_tagger_model(n_tags, token_vector_width)
|
||||
def Model(cls, n_tags, tok2vec=None, **cfg):
|
||||
return build_tagger_model(n_tags, tok2vec=tok2vec, **cfg)
|
||||
|
||||
def get_loss(self, docs, golds, scores):
|
||||
scores = self.model.ops.flatten(scores)
|
||||
cdef int idx = 0
|
||||
correct = numpy.zeros((scores.shape[0],), dtype='i')
|
||||
guesses = scores.argmax(axis=1)
|
||||
for gold in golds:
|
||||
for tag in gold.labels:
|
||||
if tag is None or tag not in self.labels:
|
||||
for i in range(len(gold.labels)):
|
||||
label = self.make_label(i, gold.words, gold.tags, gold.heads,
|
||||
gold.labels, gold.ents)
|
||||
if label is None or label not in self.labels:
|
||||
correct[idx] = guesses[idx]
|
||||
else:
|
||||
correct[idx] = self.labels[tag]
|
||||
correct[idx] = self.labels[label]
|
||||
idx += 1
|
||||
correct = self.model.ops.xp.array(correct, dtype='i')
|
||||
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
||||
d_scores /= d_scores.shape[0]
|
||||
loss = (d_scores**2).sum()
|
||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
||||
return float(loss), d_scores
|
||||
|
||||
@staticmethod
|
||||
def make_dep(i, words, tags, heads, deps, ents):
    """Return the dependency label of token `i`.

    i: Index of the token within the per-token sequences.
    deps: Per-token dependency labels.
    heads: Per-token heads.
    RETURNS: deps[i], or None when deps[i] or heads[i] is None.

    words, tags and ents are not read; they are accepted so that all
    make_* label functions share the same signature.
    """
    if deps[i] is None or heads[i] is None:
        return None
    return deps[i]
|
||||
|
||||
@staticmethod
|
||||
def make_tag(i, words, tags, heads, deps, ents):
    """Return the tag of token `i`.

    i: Index of the token within the per-token sequences.
    tags: Per-token tags.
    RETURNS: tags[i], unchanged (including None if that is the entry).

    words, heads, deps and ents are not read; they are accepted so that
    all make_* label functions share the same signature.
    """
    return tags[i]
|
||||
|
||||
@staticmethod
|
||||
def make_ent(i, words, tags, heads, deps, ents):
    """Return the entity annotation of token `i`.

    i: Index of the token within the per-token sequences.
    ents: Per-token entity annotations, or None when not available.
    RETURNS: ents[i], or None when `ents` itself is None.

    words, tags, heads and deps are not read; they are accepted so that
    all make_* label functions share the same signature.
    """
    if ents is None:
        return None
    return ents[i]
|
||||
|
||||
@staticmethod
|
||||
def make_dep_tag_offset(i, words, tags, heads, deps, ents):
    """Build a combined dependency/tag/head-offset label for token `i`.

    i: Index of the token within the per-token sequences.
    tags: Per-token tags.
    heads: Per-token heads; heads[i] - i is used as a signed offset.
    deps: Per-token dependency labels.
    RETURNS: The string '<dep>-<tag>:<offset>', where the offset is
        heads[i] - i clamped to the range [-2, 2]; or None when deps[i]
        or heads[i] is None.

    words and ents are not read; they are accepted so that all make_*
    label functions share the same signature.
    """
    if deps[i] is None or heads[i] is None:
        return None
    # Clamp the offset so distant heads collapse into the +/-2 buckets.
    offset = max(min(heads[i] - i, 2), -2)
    return '%s-%s:%d' % (deps[i], tags[i], offset)
|
||||
|
||||
@staticmethod
|
||||
def make_ent_tag(i, words, tags, heads, deps, ents):
    """Build a combined tag/entity label for token `i`.

    i: Index of the token within the per-token sequences.
    tags: Per-token tags.
    ents: Per-token entity annotations, or None when not available.
    RETURNS: The string '<tag>-<ent>', or None when `ents` is None or
        ents[i] is None.

    words, heads and deps are not read; they are accepted so that all
    make_* label functions share the same signature.
    """
    if ents is None or ents[i] is None:
        return None
    return '%s-%s' % (tags[i], ents[i])
|
||||
|
||||
|
||||
class SimilarityHook(BaseThincComponent):
|
||||
"""
|
||||
|
@ -555,7 +642,7 @@ class SimilarityHook(BaseThincComponent):
|
|||
return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
|
||||
|
||||
def __call__(self, doc):
|
||||
'''Install similarity hook'''
|
||||
"""Install similarity hook"""
|
||||
doc.user_hooks['similarity'] = self.predict
|
||||
return doc
|
||||
|
||||
|
@ -564,15 +651,10 @@ class SimilarityHook(BaseThincComponent):
|
|||
yield self(doc)
|
||||
|
||||
def predict(self, doc1, doc2):
|
||||
return self.model.predict([(doc1.tensor, doc2.tensor)])
|
||||
return self.model.predict([(doc1, doc2)])
|
||||
|
||||
def update(self, doc1_tensor1_doc2_tensor2, golds, sgd=None, drop=0.):
|
||||
doc1s, tensor1s, doc2s, tensor2s = doc1_tensor1_doc2_tensor2
|
||||
sims, bp_sims = self.model.begin_update(zip(tensor1s, tensor2s),
|
||||
drop=drop)
|
||||
d_tensor1s, d_tensor2s = bp_sims(golds, sgd=sgd)
|
||||
|
||||
return d_tensor1s, d_tensor2s
|
||||
def update(self, doc1_doc2, golds, sgd=None, drop=0.):
|
||||
sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
|
||||
|
||||
def begin_training(self, _=tuple(), pipeline=None):
|
||||
"""
|
||||
|
@ -583,6 +665,7 @@ class SimilarityHook(BaseThincComponent):
|
|||
"""
|
||||
if self.model is True:
|
||||
self.model = self.Model(pipeline[0].model.nO)
|
||||
link_vectors_to_models(self.vocab)
|
||||
|
||||
|
||||
class TextCategorizer(BaseThincComponent):
|
||||
|
@ -627,15 +710,13 @@ class TextCategorizer(BaseThincComponent):
|
|||
for j, label in enumerate(self.labels):
|
||||
doc.cats[label] = float(scores[i, j])
|
||||
|
||||
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
|
||||
docs, tensors = docs_tensors
|
||||
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
|
||||
scores, bp_scores = self.model.begin_update(docs, drop=drop)
|
||||
loss, d_scores = self.get_loss(docs, golds, scores)
|
||||
d_tensors = bp_scores(d_scores, sgd=sgd)
|
||||
bp_scores(d_scores, sgd=sgd)
|
||||
if losses is not None:
|
||||
losses.setdefault(self.name, 0.0)
|
||||
losses[self.name] += loss
|
||||
return d_tensors
|
||||
|
||||
def get_loss(self, docs, golds, scores):
|
||||
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
|
||||
|
@ -653,8 +734,10 @@ class TextCategorizer(BaseThincComponent):
|
|||
else:
|
||||
token_vector_width = 64
|
||||
if self.model is True:
|
||||
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||
self.model = self.Model(len(self.labels), token_vector_width,
|
||||
**self.cfg)
|
||||
link_vectors_to_models(self.vocab)
|
||||
|
||||
|
||||
cdef class EntityRecognizer(LinearParser):
|
||||
|
@ -695,6 +778,14 @@ cdef class NeuralDependencyParser(NeuralParser):
|
|||
name = 'parser'
|
||||
TransitionSystem = ArcEager
|
||||
|
||||
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
    """Create auxiliary NeuralLabeller objectives for this parser.

    gold_tuples: Passed through to each labeller's begin_training().
    pipeline (list): Passed to begin_training(); each labeller created
        here is also appended to it.
    **cfg: Accepted but not read by this method.

    For every target in the list below, a NeuralLabeller is created,
    initialised via begin_training() with tok2vec=self.model[0],
    appended to `pipeline` and recorded in self._multitasks.
    """
    # The target list is empty, so the loop body never runs and the
    # method is currently a no-op.
    for target in []:
        labeller = NeuralLabeller(self.vocab, target=target)
        tok2vec = self.model[0]
        labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
        pipeline.append(labeller)
        self._multitasks.append(labeller)
|
||||
|
||||
def __reduce__(self):
|
||||
return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
|
||||
|
||||
|
@ -705,13 +796,13 @@ cdef class NeuralEntityRecognizer(NeuralParser):
|
|||
|
||||
nr_feature = 6
|
||||
|
||||
def predict_confidences(self, docs):
|
||||
tensors = [d.tensor for d in docs]
|
||||
samples = []
|
||||
for i in range(10):
|
||||
states = self.parse_batch(docs, tensors, drop=0.3)
|
||||
for state in states:
|
||||
samples.append(self._get_entities(state))
|
||||
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
    """Create auxiliary NeuralLabeller objectives for this recognizer.

    gold_tuples: Passed through to each labeller's begin_training().
    pipeline (list): Passed to begin_training(); each labeller created
        here is also appended to it.
    **cfg: Accepted but not read by this method.

    For every target in the list below, a NeuralLabeller is created,
    initialised via begin_training() with tok2vec=self.model[0],
    appended to `pipeline` and recorded in self._multitasks.
    """
    # The target list is empty, so the loop body never runs and the
    # method is currently a no-op.
    for target in []:
        labeller = NeuralLabeller(self.vocab, target=target)
        tok2vec = self.model[0]
        labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
        pipeline.append(labeller)
        self._multitasks.append(labeller)
|
||||
|
||||
def __reduce__(self):
|
||||
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
cpdef enum symbol_t:
|
||||
cdef enum symbol_t:
|
||||
NIL
|
||||
IS_ALPHA
|
||||
IS_ASCII
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
# coding: utf8
|
||||
#cython: optimize.unpack_method_calls=False
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
IDS = {
|
||||
|
@ -458,4 +460,11 @@ IDS = {
|
|||
"xcomp": xcomp
|
||||
}
|
||||
|
||||
NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])]
|
||||
def sort_nums(x):
    """Sort key: return the second element of a pair.

    x: An indexable pair such as a (name, id) item from IDS.items().
    RETURNS: x[1].
    """
    return x[1]
|
||||
|
||||
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
|
||||
# Unfortunate hack here, to work around problem with long cpdef enum
|
||||
# (which is generating an enormous amount of C++ in Cython 0.24+)
|
||||
# We keep the enum cdef, and just make sure the names are available to Python
|
||||
locals().update(IDS)
|
||||
|
|
|
@ -147,10 +147,10 @@ def get_token_ids(states, int n_tokens):
|
|||
|
||||
nr_update = 0
|
||||
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
||||
states, tokvecs, golds,
|
||||
states, golds,
|
||||
state2vec, vec2scores,
|
||||
int width, float density,
|
||||
sgd=None, losses=None, drop=0.):
|
||||
losses=None, drop=0.):
|
||||
global nr_update
|
||||
cdef MaxViolation violn
|
||||
nr_update += 1
|
||||
|
|
|
@ -101,9 +101,10 @@ cdef cppclass StateC:
|
|||
elif n == 6:
|
||||
if this.B(0) >= 0:
|
||||
ids[0] = this.B(0)
|
||||
ids[1] = this.B(0)-1
|
||||
else:
|
||||
ids[0] = -1
|
||||
ids[1] = this.B(0)
|
||||
ids[1] = -1
|
||||
ids[2] = this.B(1)
|
||||
ids[3] = this.E(0)
|
||||
if ids[3] >= 1:
|
||||
|
@ -120,6 +121,8 @@ cdef cppclass StateC:
|
|||
for i in range(n):
|
||||
if ids[i] >= 0:
|
||||
ids[i] += this.offset
|
||||
else:
|
||||
ids[i] = -1
|
||||
|
||||
int S(int i) nogil const:
|
||||
if i >= this._s_i:
|
||||
|
@ -162,9 +165,9 @@ cdef cppclass StateC:
|
|||
|
||||
int E(int i) nogil const:
|
||||
if this._e_i <= 0 or this._e_i >= this.length:
|
||||
return 0
|
||||
return -1
|
||||
if i < 0 or i >= this._e_i:
|
||||
return 0
|
||||
return -1
|
||||
return this._ents[this._e_i - (i+1)].start
|
||||
|
||||
int L(int i, int idx) nogil const:
|
||||
|
|
|
@ -161,8 +161,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
cdef Transition lookup_transition(self, object name) except *:
|
||||
cdef attr_t label
|
||||
if name == '-' or name == None:
|
||||
move_str = 'M'
|
||||
label = 0
|
||||
return Transition(clas=0, move=MISSING, label=0, score=0)
|
||||
elif name == '!O':
|
||||
return Transition(clas=0, move=ISNT, label=0, score=0)
|
||||
elif '-' in name:
|
||||
|
@ -220,6 +219,31 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
raise Exception(move)
|
||||
return t
|
||||
|
||||
#def add_action(self, int action, label_name):
|
||||
# cdef attr_t label_id
|
||||
# if not isinstance(label_name, (int, long)):
|
||||
# label_id = self.strings.add(label_name)
|
||||
# else:
|
||||
# label_id = label_name
|
||||
# if action == OUT and label_id != 0:
|
||||
# return
|
||||
# if action == MISSING or action == ISNT:
|
||||
# return
|
||||
# # Check we're not creating a move we already have, so that this is
|
||||
# # idempotent
|
||||
# for trans in self.c[:self.n_moves]:
|
||||
# if trans.move == action and trans.label == label_id:
|
||||
# return 0
|
||||
# if self.n_moves >= self._size:
|
||||
# self._size *= 2
|
||||
# self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
|
||||
# self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
|
||||
# assert self.c[self.n_moves].label == label_id
|
||||
# self.n_moves += 1
|
||||
# return 1
|
||||
|
||||
|
||||
|
||||
cdef int initialize_state(self, StateC* st) nogil:
|
||||
# This is especially necessary when we use limited training data.
|
||||
for i in range(st.length):
|
||||
|
|
|
@ -13,6 +13,7 @@ cdef class Parser:
|
|||
cdef public object model
|
||||
cdef readonly TransitionSystem moves
|
||||
cdef readonly object cfg
|
||||
cdef public object _multitasks
|
||||
|
||||
cdef void _parse_step(self, StateC* state,
|
||||
const float* feat_weights,
|
||||
|
|
|
@ -7,6 +7,7 @@ from __future__ import unicode_literals, print_function
|
|||
|
||||
from collections import Counter, OrderedDict
|
||||
import ujson
|
||||
import json
|
||||
import contextlib
|
||||
|
||||
from libc.math cimport exp
|
||||
|
@ -37,10 +38,9 @@ from preshed.maps cimport MapStruct
|
|||
from preshed.maps cimport map_get
|
||||
|
||||
from thinc.api import layerize, chain, noop, clone, with_flatten
|
||||
from thinc.neural import Model, Affine, ReLu, Maxout
|
||||
from thinc.neural._classes.batchnorm import BatchNorm as BN
|
||||
from thinc.neural._classes.selu import SELU
|
||||
from thinc.neural._classes.layernorm import LayerNorm
|
||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
|
||||
from thinc.misc import LayerNorm
|
||||
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.util import get_array_module
|
||||
|
||||
|
@ -48,7 +48,8 @@ from .. import util
|
|||
from ..util import get_async, get_cuda_stream
|
||||
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
||||
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
|
||||
from .._ml import Residual, drop_layer
|
||||
from .._ml import Residual, drop_layer, flatten
|
||||
from .._ml import link_vectors_to_models
|
||||
from ..compat import json_dumps
|
||||
|
||||
from . import _parse_features
|
||||
|
@ -238,14 +239,15 @@ cdef class Parser:
|
|||
Base class of the DependencyParser and EntityRecognizer.
|
||||
"""
|
||||
@classmethod
|
||||
def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg):
|
||||
def Model(cls, nr_class, token_vector_width=128, hidden_width=200, depth=1, **cfg):
|
||||
depth = util.env_opt('parser_hidden_depth', depth)
|
||||
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
||||
hidden_width = util.env_opt('hidden_width', hidden_width)
|
||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
||||
embed_size = util.env_opt('embed_size', 4000)
|
||||
tensors = fine_tune(Tok2Vec(token_vector_width, embed_size,
|
||||
preprocess=doc2feats()))
|
||||
embed_size = util.env_opt('embed_size', 7000)
|
||||
tok2vec = Tok2Vec(token_vector_width, embed_size,
|
||||
pretrained_dims=cfg.get('pretrained_dims', 0))
|
||||
tok2vec = chain(tok2vec, flatten)
|
||||
if parser_maxout_pieces == 1:
|
||||
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
||||
nF=cls.nr_feature,
|
||||
|
@ -262,8 +264,8 @@ cdef class Parser:
|
|||
upper.is_noop = True
|
||||
else:
|
||||
upper = chain(
|
||||
clone(Maxout(hidden_width), (depth-1)),
|
||||
zero_init(Affine(nr_class, drop_factor=0.0))
|
||||
clone(Maxout(hidden_width), depth-1),
|
||||
zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
|
||||
)
|
||||
upper.is_noop = False
|
||||
# TODO: This is an unfortunate hack atm!
|
||||
|
@ -277,7 +279,7 @@ cdef class Parser:
|
|||
'hidden_width': hidden_width,
|
||||
'maxout_pieces': parser_maxout_pieces
|
||||
}
|
||||
return (tensors, lower, upper), cfg
|
||||
return (tok2vec, lower, upper), cfg
|
||||
|
||||
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
||||
"""
|
||||
|
@ -307,12 +309,16 @@ cdef class Parser:
|
|||
cfg['beam_width'] = util.env_opt('beam_width', 1)
|
||||
if 'beam_density' not in cfg:
|
||||
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
|
||||
if 'pretrained_dims' not in cfg:
|
||||
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
||||
cfg.setdefault('cnn_maxout_pieces', 3)
|
||||
self.cfg = cfg
|
||||
if 'actions' in self.cfg:
|
||||
for action, labels in self.cfg.get('actions', {}).items():
|
||||
for label in labels:
|
||||
self.moves.add_action(action, label)
|
||||
self.model = model
|
||||
self._multitasks = []
|
||||
|
||||
def __reduce__(self):
|
||||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||
|
@ -332,11 +338,11 @@ cdef class Parser:
|
|||
beam_density = self.cfg.get('beam_density', 0.0)
|
||||
cdef Beam beam
|
||||
if beam_width == 1:
|
||||
states = self.parse_batch([doc], [doc.tensor])
|
||||
states = self.parse_batch([doc])
|
||||
self.set_annotations([doc], states)
|
||||
return doc
|
||||
else:
|
||||
beam = self.beam_parse([doc], [doc.tensor],
|
||||
beam = self.beam_parse([doc],
|
||||
beam_width=beam_width, beam_density=beam_density)[0]
|
||||
output = self.moves.get_beam_annot(beam)
|
||||
state = <StateClass>beam.at(0)
|
||||
|
@ -365,11 +371,11 @@ cdef class Parser:
|
|||
cdef Beam beam
|
||||
for docs in cytoolz.partition_all(batch_size, docs):
|
||||
docs = list(docs)
|
||||
tokvecs = [doc.tensor for doc in docs]
|
||||
if beam_width == 1:
|
||||
parse_states = self.parse_batch(docs, tokvecs)
|
||||
parse_states = self.parse_batch(docs)
|
||||
beams = []
|
||||
else:
|
||||
beams = self.beam_parse(docs, tokvecs,
|
||||
beams = self.beam_parse(docs,
|
||||
beam_width=beam_width, beam_density=beam_density)
|
||||
parse_states = []
|
||||
for beam in beams:
|
||||
|
@ -377,7 +383,7 @@ cdef class Parser:
|
|||
self.set_annotations(docs, parse_states)
|
||||
yield from docs
|
||||
|
||||
def parse_batch(self, docs, tokvecses):
|
||||
def parse_batch(self, docs):
|
||||
cdef:
|
||||
precompute_hiddens state2vec
|
||||
StateClass state
|
||||
|
@ -388,21 +394,15 @@ cdef class Parser:
|
|||
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
if isinstance(tokvecses, np.ndarray):
|
||||
tokvecses = [tokvecses]
|
||||
|
||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||
if USE_FINE_TUNE:
|
||||
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
||||
cuda_stream = get_cuda_stream()
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||
0.0)
|
||||
|
||||
nr_state = len(docs)
|
||||
nr_class = self.moves.n_moves
|
||||
nr_dim = tokvecs.shape[1]
|
||||
nr_feat = self.nr_feature
|
||||
|
||||
cuda_stream = get_cuda_stream()
|
||||
state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs,
|
||||
cuda_stream, 0.0)
|
||||
nr_piece = state2vec.nP
|
||||
|
||||
states = self.moves.init_batch(docs)
|
||||
|
@ -418,21 +418,23 @@ cdef class Parser:
|
|||
c_token_ids = <int*>token_ids.data
|
||||
c_is_valid = <int*>is_valid.data
|
||||
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
|
||||
cdef int nr_step
|
||||
while not next_step.empty():
|
||||
nr_step = next_step.size()
|
||||
if not has_hidden:
|
||||
for i in cython.parallel.prange(
|
||||
next_step.size(), num_threads=6, nogil=True):
|
||||
for i in cython.parallel.prange(nr_step, num_threads=6,
|
||||
nogil=True):
|
||||
self._parse_step(next_step[i],
|
||||
feat_weights, nr_class, nr_feat, nr_piece)
|
||||
else:
|
||||
for i in range(next_step.size()):
|
||||
for i in range(nr_step):
|
||||
st = next_step[i]
|
||||
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
|
||||
self.moves.set_valid(&c_is_valid[i*nr_class], st)
|
||||
vectors = state2vec(token_ids[:next_step.size()])
|
||||
scores = vec2scores(vectors)
|
||||
c_scores = <float*>scores.data
|
||||
for i in range(next_step.size()):
|
||||
for i in range(nr_step):
|
||||
st = next_step[i]
|
||||
guess = arg_max_if_valid(
|
||||
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
|
||||
|
@ -445,18 +447,15 @@ cdef class Parser:
|
|||
next_step.push_back(st)
|
||||
return states
|
||||
|
||||
def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001):
|
||||
def beam_parse(self, docs, int beam_width=3, float beam_density=0.001):
|
||||
cdef Beam beam
|
||||
cdef np.ndarray scores
|
||||
cdef Doc doc
|
||||
cdef int nr_class = self.moves.n_moves
|
||||
cdef StateClass stcls, output
|
||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||
if USE_FINE_TUNE:
|
||||
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
||||
cuda_stream = get_cuda_stream()
|
||||
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
|
||||
cuda_stream, 0.0)
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||
0.0)
|
||||
beams = []
|
||||
cdef int offset = 0
|
||||
cdef int j = 0
|
||||
|
@ -516,29 +515,24 @@ cdef class Parser:
|
|||
free(scores)
|
||||
free(token_ids)
|
||||
|
||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||
return None
|
||||
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
|
||||
return self.update_beam(docs_tokvecs, golds,
|
||||
return self.update_beam(docs, golds,
|
||||
self.cfg['beam_width'], self.cfg['beam_density'],
|
||||
drop=drop, sgd=sgd, losses=losses)
|
||||
if losses is not None and self.name not in losses:
|
||||
losses[self.name] = 0.
|
||||
docs, tokvec_lists = docs_tokvecs
|
||||
tokvecs = self.model[0].ops.flatten(tokvec_lists)
|
||||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||
docs = [docs]
|
||||
golds = [golds]
|
||||
if USE_FINE_TUNE:
|
||||
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
||||
tokvecs = self.model[0].ops.flatten(my_tokvecs)
|
||||
|
||||
cuda_stream = get_cuda_stream()
|
||||
|
||||
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
||||
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
|
||||
0.0)
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||
drop)
|
||||
todo = [(s, g) for (s, g) in zip(states, golds)
|
||||
if not s.is_final() and g is not None]
|
||||
if not todo:
|
||||
|
@ -582,13 +576,9 @@ cdef class Parser:
|
|||
if n_steps >= max_steps:
|
||||
break
|
||||
self._make_updates(d_tokvecs,
|
||||
backprops, sgd, cuda_stream)
|
||||
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
||||
if USE_FINE_TUNE:
|
||||
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||
return d_tokvecs
|
||||
bp_tokvecs, backprops, sgd, cuda_stream)
|
||||
|
||||
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
|
||||
def update_beam(self, docs, golds, width=None, density=None,
|
||||
drop=0., sgd=None, losses=None):
|
||||
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||
return None
|
||||
|
@ -600,26 +590,20 @@ cdef class Parser:
|
|||
density = self.cfg.get('beam_density', 0.0)
|
||||
if losses is not None and self.name not in losses:
|
||||
losses[self.name] = 0.
|
||||
docs, tokvecs = docs_tokvecs
|
||||
lengths = [len(d) for d in docs]
|
||||
assert min(lengths) >= 1
|
||||
tokvecs = self.model[0].ops.flatten(tokvecs)
|
||||
if USE_FINE_TUNE:
|
||||
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
||||
tokvecs += self.model[0].ops.flatten(my_tokvecs)
|
||||
|
||||
states = self.moves.init_batch(docs)
|
||||
for gold in golds:
|
||||
self.moves.preprocess_gold(gold)
|
||||
|
||||
cuda_stream = get_cuda_stream()
|
||||
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
|
||||
|
||||
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
|
||||
states, tokvecs, golds,
|
||||
states, golds,
|
||||
state2vec, vec2scores,
|
||||
width, density,
|
||||
sgd=sgd, drop=drop, losses=losses)
|
||||
drop=drop, losses=losses)
|
||||
backprop_lower = []
|
||||
cdef float batch_size = len(docs)
|
||||
for i, d_scores in enumerate(states_d_scores):
|
||||
|
@ -637,11 +621,7 @@ cdef class Parser:
|
|||
else:
|
||||
backprop_lower.append((ids, d_vector, bp_vectors))
|
||||
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
||||
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
|
||||
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
|
||||
if USE_FINE_TUNE:
|
||||
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||
return d_tokvecs
|
||||
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)
|
||||
|
||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||
"""Make a square batch, of length equal to the shortest doc. A long
|
||||
|
@ -679,7 +659,7 @@ cdef class Parser:
|
|||
max_moves = max(max_moves, len(oracle_actions))
|
||||
return states, golds, max_moves
|
||||
|
||||
def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
|
||||
def _make_updates(self, d_tokvecs, bp_tokvecs, backprops, sgd, cuda_stream=None):
|
||||
# Tells CUDA to block, so our async copies complete.
|
||||
if cuda_stream is not None:
|
||||
cuda_stream.synchronize()
|
||||
|
@ -690,6 +670,7 @@ cdef class Parser:
|
|||
d_state_features *= mask.reshape(ids.shape + (1,))
|
||||
self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
|
||||
d_state_features)
|
||||
bp_tokvecs(d_tokvecs, sgd=sgd)
|
||||
|
||||
@property
|
||||
def move_names(self):
|
||||
|
@ -699,11 +680,12 @@ cdef class Parser:
|
|||
names.append(name)
|
||||
return names
|
||||
|
||||
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
|
||||
_, lower, upper = self.model
|
||||
state2vec = precompute_hiddens(batch_size, tokvecs,
|
||||
lower, stream, drop=dropout)
|
||||
return state2vec, upper
|
||||
def get_batch_model(self, docs, stream, dropout):
|
||||
tok2vec, lower, upper = self.model
|
||||
tokvecs, bp_tokvecs = tok2vec.begin_update(docs, drop=dropout)
|
||||
state2vec = precompute_hiddens(len(docs), tokvecs,
|
||||
lower, stream, drop=0.0)
|
||||
return (tokvecs, bp_tokvecs), state2vec, upper
|
||||
|
||||
nr_feature = 8
|
||||
|
||||
|
@ -766,7 +748,7 @@ cdef class Parser:
|
|||
# order, or the model goes out of synch
|
||||
self.cfg.setdefault('extra_labels', []).append(label)
|
||||
|
||||
def begin_training(self, gold_tuples, **cfg):
|
||||
def begin_training(self, gold_tuples, pipeline=None, **cfg):
|
||||
if 'model' in cfg:
|
||||
self.model = cfg['model']
|
||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
||||
|
@ -775,9 +757,22 @@ cdef class Parser:
|
|||
for label in labels:
|
||||
self.moves.add_action(action, label)
|
||||
if self.model is True:
|
||||
cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
|
||||
self.init_multitask_objectives(gold_tuples, pipeline, **cfg)
|
||||
link_vectors_to_models(self.vocab)
|
||||
self.cfg.update(cfg)
|
||||
|
||||
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
|
||||
'''Setup models for secondary objectives, to benefit from multi-task
|
||||
learning. This method is intended to be overridden by subclasses.
|
||||
|
||||
For instance, the dependency parser can benefit from sharing
|
||||
an input representation with a label prediction model. These auxiliary
|
||||
models are discarded after training.
|
||||
'''
|
||||
pass
|
||||
|
||||
def preprocess_gold(self, docs_golds):
|
||||
for doc, gold in docs_golds:
|
||||
yield doc, gold
|
||||
|
@ -813,6 +808,7 @@ cdef class Parser:
|
|||
if 'model' not in exclude:
|
||||
path = util.ensure_path(path)
|
||||
if self.model is True:
|
||||
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||
self.model, cfg = self.Model(**self.cfg)
|
||||
else:
|
||||
cfg = {}
|
||||
|
@ -835,7 +831,7 @@ cdef class Parser:
|
|||
('upper_model', lambda: self.model[2].to_bytes()),
|
||||
('vocab', lambda: self.vocab.to_bytes()),
|
||||
('moves', lambda: self.moves.to_bytes(strings=False)),
|
||||
('cfg', lambda: ujson.dumps(self.cfg))
|
||||
('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True))
|
||||
))
|
||||
if 'model' in exclude:
|
||||
exclude['tok2vec_model'] = True
|
||||
|
@ -848,7 +844,7 @@ cdef class Parser:
|
|||
deserializers = OrderedDict((
|
||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
||||
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
||||
('cfg', lambda b: self.cfg.update(json.loads(b))),
|
||||
('tok2vec_model', lambda b: None),
|
||||
('lower_model', lambda b: None),
|
||||
('upper_model', lambda b: None)
|
||||
|
@ -856,9 +852,11 @@ cdef class Parser:
|
|||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||
if 'model' not in exclude:
|
||||
if self.model is True:
|
||||
self.model, cfg = self.Model(self.moves.n_moves)
|
||||
self.model, cfg = self.Model(**self.cfg)
|
||||
cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||
else:
|
||||
cfg = {}
|
||||
cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||
if 'tok2vec_model' in msg:
|
||||
self.model[0].from_bytes(msg['tok2vec_model'])
|
||||
if 'lower_model' in msg:
|
||||
|
|
|
@ -148,7 +148,7 @@ cdef class TransitionSystem:
|
|||
|
||||
def add_action(self, int action, label_name):
|
||||
cdef attr_t label_id
|
||||
if not isinstance(label_name, int):
|
||||
if not isinstance(label_name, (int, long)):
|
||||
label_id = self.strings.add(label_name)
|
||||
else:
|
||||
label_id = label_name
|
||||
|
|
|
@ -12,7 +12,7 @@ from .. import util
|
|||
|
||||
|
||||
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
|
||||
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
|
||||
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'th','xx']
|
||||
_models = {'en': ['en_core_web_sm'],
|
||||
'de': ['de_core_news_md'],
|
||||
'fr': ['fr_depvec_web_lg'],
|
||||
|
@ -108,6 +108,11 @@ def he_tokenizer():
|
|||
def nb_tokenizer():
|
||||
return util.get_lang_class('nb').Defaults.create_tokenizer()
|
||||
|
||||
@pytest.fixture
|
||||
def th_tokenizer():
|
||||
pythainlp = pytest.importorskip("pythainlp")
|
||||
return util.get_lang_class('th').Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def stringstore():
|
||||
|
|
|
@ -67,12 +67,6 @@ def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
|
|||
assert len(tokens) == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["blau-rot"])
|
||||
def test_tokenizer_splits_hyphens(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
|
||||
def test_tokenizer_splits_numeric_range(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
|
@ -100,17 +94,21 @@ def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
|
|||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt'])
|
||||
def test_tokenizer_keeps_hyphens(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
def test_tokenizer_splits_double_hyphen_infix(de_tokenizer):
|
||||
tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
|
||||
assert len(tokens) == 12
|
||||
assert len(tokens) == 10
|
||||
assert tokens[0].text == "Viele"
|
||||
assert tokens[1].text == "Regeln"
|
||||
assert tokens[2].text == "--"
|
||||
assert tokens[3].text == "wie"
|
||||
assert tokens[4].text == "die"
|
||||
assert tokens[5].text == "Bindestrich"
|
||||
assert tokens[6].text == "-"
|
||||
assert tokens[7].text == "Regeln"
|
||||
assert tokens[8].text == "--"
|
||||
assert tokens[9].text == "sind"
|
||||
assert tokens[10].text == "kompliziert"
|
||||
assert tokens[5].text == "Bindestrich-Regeln"
|
||||
assert tokens[6].text == "--"
|
||||
assert tokens[7].text == "sind"
|
||||
assert tokens[8].text == "kompliziert"
|
||||
|
|
|
@ -25,15 +25,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
|
|||
assert len(tokens) == 109
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [
|
||||
("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1),
|
||||
("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1),
|
||||
("Kraftfahrzeug-Haftpflichtversicherung", 3),
|
||||
("Vakuum-Mittelfrequenz-Induktionsofen", 5)
|
||||
@pytest.mark.parametrize('text', [
|
||||
"Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
|
||||
"Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
|
||||
"Kraftfahrzeug-Haftpflichtversicherung",
|
||||
"Vakuum-Mittelfrequenz-Induktionsofen"
|
||||
])
|
||||
def test_tokenizer_handles_long_words(de_tokenizer, text, length):
|
||||
def test_tokenizer_handles_long_words(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [
|
||||
|
|
0
spacy/tests/lang/th/__init__.py
Normal file
0
spacy/tests/lang/th/__init__.py
Normal file
13
spacy/tests/lang/th/test_tokenizer.py
Normal file
13
spacy/tests/lang/th/test_tokenizer.py
Normal file
|
@ -0,0 +1,13 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
TOKENIZER_TESTS = [
|
||||
("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม'])
|
||||
]
|
||||
|
||||
@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
|
||||
def test_thai_tokenizer(th_tokenizer, text, expected_tokens):
|
||||
tokens = [token.text for token in th_tokenizer(text)]
|
||||
assert tokens == expected_tokens
|
|
@ -26,7 +26,7 @@ def arc_eager(vocab):
|
|||
|
||||
@pytest.fixture
|
||||
def tok2vec():
|
||||
return Tok2Vec(8, 100, preprocess=doc2feats())
|
||||
return Tok2Vec(8, 100)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -61,33 +61,22 @@ def test_predict_doc(parser, tok2vec, model, doc):
|
|||
parser(doc)
|
||||
|
||||
|
||||
def test_update_doc(parser, tok2vec, model, doc, gold):
|
||||
def test_update_doc(parser, model, doc, gold):
|
||||
parser.model = model
|
||||
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
|
||||
d_tokvecs = parser.update(([doc], tokvecs), [gold])
|
||||
assert d_tokvecs[0].shape == tokvecs[0].shape
|
||||
def optimize(weights, gradient, key=None):
|
||||
weights -= 0.001 * gradient
|
||||
bp_tokvecs(d_tokvecs, sgd=optimize)
|
||||
assert d_tokvecs[0].sum() == 0.
|
||||
parser.update([doc], [gold], sgd=optimize)
|
||||
|
||||
|
||||
def test_predict_doc_beam(parser, tok2vec, model, doc):
|
||||
doc.tensor = tok2vec([doc])[0]
|
||||
def test_predict_doc_beam(parser, model, doc):
|
||||
parser.model = model
|
||||
parser(doc, beam_width=32, beam_density=0.001)
|
||||
for word in doc:
|
||||
print(word.text, word.head, word.dep_)
|
||||
|
||||
|
||||
def test_update_doc_beam(parser, tok2vec, model, doc, gold):
|
||||
def test_update_doc_beam(parser, model, doc, gold):
|
||||
parser.model = model
|
||||
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
|
||||
d_tokvecs = parser.update_beam(([doc], tokvecs), [gold])
|
||||
assert d_tokvecs[0].shape == tokvecs[0].shape
|
||||
def optimize(weights, gradient, key=None):
|
||||
weights -= 0.001 * gradient
|
||||
bp_tokvecs(d_tokvecs, sgd=optimize)
|
||||
assert d_tokvecs[0].sum() == 0.
|
||||
parser.update_beam([doc], [gold], sgd=optimize)
|
||||
|
||||
|
||||
|
|
8
spacy/tests/regression/test_issue1305.py
Normal file
8
spacy/tests/regression/test_issue1305.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
import pytest
|
||||
|
||||
@pytest.mark.models('en')
|
||||
def test_issue1305(EN):
|
||||
'''Test lemmatization of English VBZ'''
|
||||
assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work'])
|
||||
doc = EN(u'This app works well')
|
||||
assert doc[2].lemma_ == 'work'
|
14
spacy/tests/regression/test_issue1380.py
Normal file
14
spacy/tests/regression/test_issue1380.py
Normal file
|
@ -0,0 +1,14 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
|
||||
from ...language import Language
|
||||
|
||||
def test_issue1380_empty_string():
|
||||
nlp = Language()
|
||||
doc = nlp('')
|
||||
assert len(doc) == 0
|
||||
|
||||
@pytest.mark.models('en')
|
||||
def test_issue1380_en(EN):
|
||||
doc = EN('')
|
||||
assert len(doc) == 0
|
|
@ -13,7 +13,10 @@ def test_issue429(EN):
|
|||
return None
|
||||
spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
|
||||
for ent_id, label, span in spans:
|
||||
span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label])
|
||||
span.merge(
|
||||
tag=('NNP' if label else span.root.tag_),
|
||||
lemma=span.text,
|
||||
label='PERSON')
|
||||
|
||||
doc = EN('a')
|
||||
matcher = Matcher(EN.vocab)
|
||||
|
|
|
@ -11,7 +11,7 @@ import pytest
|
|||
def taggers(en_vocab):
|
||||
tagger1 = Tagger(en_vocab)
|
||||
tagger2 = Tagger(en_vocab)
|
||||
tagger1.model = tagger1.Model(8, 8)
|
||||
tagger1.model = tagger1.Model(8)
|
||||
tagger2.model = tagger1.model
|
||||
return (tagger1, tagger2)
|
||||
|
||||
|
|
|
@ -6,6 +6,16 @@ from ...strings import StringStore
|
|||
import pytest
|
||||
|
||||
|
||||
def test_string_hash(stringstore):
|
||||
'''Test that string hashing is stable across platforms'''
|
||||
ss = stringstore
|
||||
assert ss.add('apple') == 8566208034543834098
|
||||
heart = '\U0001f499'
|
||||
print(heart)
|
||||
h = ss.add(heart)
|
||||
assert h == 11841826740069053588
|
||||
|
||||
|
||||
def test_stringstore_from_api_docs(stringstore):
|
||||
apple_hash = stringstore.add('apple')
|
||||
assert apple_hash == 8566208034543834098
|
||||
|
|
|
@ -34,7 +34,6 @@ def test_matcher_from_api_docs(en_vocab):
|
|||
assert len(patterns[0])
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_matcher_from_usage_docs(en_vocab):
|
||||
text = "Wow 😀 This is really cool! 😂 😂"
|
||||
doc = get_doc(en_vocab, words=text.split(' '))
|
||||
|
@ -46,7 +45,8 @@ def test_matcher_from_usage_docs(en_vocab):
|
|||
if doc.vocab.strings[match_id] == 'HAPPY':
|
||||
doc.sentiment += 0.1
|
||||
span = doc[start : end]
|
||||
token = span.merge(norm='happy emoji')
|
||||
token = span.merge()
|
||||
token.vocab[token.text].norm_ = 'happy emoji'
|
||||
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add('HAPPY', label_sentiment, *pos_patterns)
|
||||
|
@ -98,11 +98,11 @@ def test_matcher_match_multi(matcher):
|
|||
(doc.vocab.strings['Java'], 5, 6)]
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_matcher_phrase_matcher(en_vocab):
|
||||
words = ["Google", "Now"]
|
||||
doc = get_doc(en_vocab, words)
|
||||
matcher = PhraseMatcher(en_vocab, [doc])
|
||||
matcher = PhraseMatcher(en_vocab)
|
||||
matcher.add('COMPANY', None, doc)
|
||||
words = ["I", "like", "Google", "Now", "best"]
|
||||
doc = get_doc(en_vocab, words)
|
||||
assert len(matcher(doc)) == 1
|
||||
|
|
|
@ -9,7 +9,8 @@ from .util import get_doc
|
|||
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
from thinc.neural import Maxout, Softmax
|
||||
from thinc.neural._classes.maxout import Maxout
|
||||
from thinc.neural._classes.softmax import Softmax
|
||||
from thinc.api import chain
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import sys
|
||||
import pytest
|
||||
|
||||
|
||||
|
@ -37,9 +38,10 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
|
|||
tokens = tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
|
||||
('i💙you', 3), ('🤘🤘yay!', 4)])
|
||||
def test_tokenizer_handles_emoji(tokenizer, text, length):
|
||||
# These break on narrow unicode builds, e.g. Windows
|
||||
if sys.maxunicode >= 1114111:
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
|
|
@ -54,7 +54,7 @@ cdef class Doc:
|
|||
|
||||
cdef public object noun_chunks_iterator
|
||||
|
||||
cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1
|
||||
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1
|
||||
|
||||
cpdef np.ndarray to_array(self, object features)
|
||||
|
||||
|
|
|
@ -660,7 +660,7 @@ cdef class Doc:
|
|||
"""
|
||||
with path.open('rb') as file_:
|
||||
bytes_data = file_.read()
|
||||
self.from_bytes(bytes_data, **exclude)
|
||||
return self.from_bytes(bytes_data, **exclude)
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
"""Serialize, i.e. export the document contents to a binary string.
|
||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals, print_function
|
|||
|
||||
import os
|
||||
import ujson
|
||||
import pip
|
||||
import pkg_resources
|
||||
import importlib
|
||||
import regex as re
|
||||
from pathlib import Path
|
||||
|
@ -14,6 +14,7 @@ import numpy
|
|||
import io
|
||||
import dill
|
||||
from collections import OrderedDict
|
||||
from thinc.neural._classes.model import Model
|
||||
|
||||
import msgpack
|
||||
import msgpack_numpy
|
||||
|
@ -180,9 +181,10 @@ def is_package(name):
|
|||
name (unicode): Name of package.
|
||||
RETURNS (bool): True if installed package, False if not.
|
||||
"""
|
||||
packages = pip.get_installed_distributions()
|
||||
name = name.lower() # compare package name against lowercase name
|
||||
packages = pkg_resources.working_set.by_key.keys()
|
||||
for package in packages:
|
||||
if package.project_name.replace('-', '_') == name:
|
||||
if package.lower().replace('-', '_') == name:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
@ -193,6 +195,7 @@ def get_package_path(name):
|
|||
name (unicode): Package name.
|
||||
RETURNS (Path): Path to installed package.
|
||||
"""
|
||||
name = name.lower() # use lowercase version to be safe
|
||||
# Here we're importing the module just to find it. This is worryingly
|
||||
# indirect, but it's otherwise very difficult to find the package.
|
||||
pkg = importlib.import_module(name)
|
||||
|
@ -557,3 +560,17 @@ def minify_html(html):
|
|||
RETURNS (unicode): "Minified" HTML.
|
||||
"""
|
||||
return html.strip().replace(' ', '').replace('\n', '')
|
||||
|
||||
|
||||
def use_gpu(gpu_id):
|
||||
try:
|
||||
import cupy.cuda.device
|
||||
except ImportError:
|
||||
return None
|
||||
from thinc.neural.ops import CupyOps
|
||||
device = cupy.cuda.device.Device(gpu_id)
|
||||
device.use()
|
||||
Model.ops = CupyOps()
|
||||
Model.Ops = CupyOps
|
||||
return device
|
||||
|
||||
|
|
|
@ -6,6 +6,8 @@ import msgpack
|
|||
import msgpack_numpy
|
||||
msgpack_numpy.patch()
|
||||
cimport numpy as np
|
||||
from thinc.neural.util import get_array_module
|
||||
from thinc.neural._classes.model import Model
|
||||
|
||||
from .typedefs cimport attr_t
|
||||
from .strings cimport StringStore
|
||||
|
@ -14,15 +16,29 @@ from .compat import basestring_
|
|||
|
||||
|
||||
cdef class Vectors:
|
||||
'''Store, save and load word vectors.'''
|
||||
'''Store, save and load word vectors.
|
||||
|
||||
Vectors data is kept in the vectors.data attribute, which should be an
|
||||
instance of numpy.ndarray (for CPU vectors)
|
||||
or cupy.ndarray (for GPU vectors).
|
||||
|
||||
vectors.key2row is a dictionary mapping word hashes to rows
|
||||
in the vectors.data table. The array `vectors.keys` keeps
|
||||
the keys in order, such that keys[vectors.key2row[key]] == key.
|
||||
'''
|
||||
cdef public object data
|
||||
cdef readonly StringStore strings
|
||||
cdef public object key2row
|
||||
cdef public object keys
|
||||
cdef public int i
|
||||
|
||||
def __init__(self, strings, data_or_width):
|
||||
def __init__(self, strings, data_or_width=0):
|
||||
if isinstance(strings, StringStore):
|
||||
self.strings = strings
|
||||
else:
|
||||
self.strings = StringStore()
|
||||
for string in strings:
|
||||
self.strings.add(string)
|
||||
if isinstance(data_or_width, int):
|
||||
self.data = data = numpy.zeros((len(strings), data_or_width),
|
||||
dtype='f')
|
||||
|
@ -37,6 +53,11 @@ cdef class Vectors:
|
|||
return (Vectors, (self.strings, self.data))
|
||||
|
||||
def __getitem__(self, key):
|
||||
'''Get a vector by key. If key is a string, it is hashed
|
||||
to an integer ID using the vectors.strings table.
|
||||
|
||||
If the integer key is not found in the table, a KeyError is raised.
|
||||
'''
|
||||
if isinstance(key, basestring):
|
||||
key = self.strings[key]
|
||||
i = self.key2row[key]
|
||||
|
@ -46,23 +67,30 @@ cdef class Vectors:
|
|||
return self.data[i]
|
||||
|
||||
def __setitem__(self, key, vector):
|
||||
'''Set a vector for the given key. If key is a string, it is hashed
|
||||
to an integer ID using the vectors.strings table.
|
||||
'''
|
||||
if isinstance(key, basestring):
|
||||
key = self.strings.add(key)
|
||||
i = self.key2row[key]
|
||||
self.data[i] = vector
|
||||
|
||||
def __iter__(self):
|
||||
'''Yield vectors from the table.'''
|
||||
yield from self.data
|
||||
|
||||
def __len__(self):
|
||||
'''Return the number of vectors that have been assigned.'''
|
||||
return self.i
|
||||
|
||||
def __contains__(self, key):
|
||||
'''Check whether a key has a vector entry in the table.'''
|
||||
if isinstance(key, basestring_):
|
||||
key = self.strings[key]
|
||||
return key in self.key2row
|
||||
|
||||
def add(self, key, vector=None):
|
||||
'''Add a key to the table, optionally setting a vector value as well.'''
|
||||
if isinstance(key, basestring_):
|
||||
key = self.strings.add(key)
|
||||
if key not in self.key2row:
|
||||
|
@ -80,7 +108,9 @@ cdef class Vectors:
|
|||
return i
|
||||
|
||||
def items(self):
|
||||
for i, string in enumerate(self.strings):
|
||||
'''Iterate over (string key, vector) pairs, in order.'''
|
||||
for i, key in enumerate(self.keys):
|
||||
string = self.strings[key]
|
||||
yield string, self.data[i]
|
||||
|
||||
@property
|
||||
|
@ -118,9 +148,14 @@ cdef class Vectors:
|
|||
self.data
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
xp = get_array_module(self.data)
|
||||
if xp is numpy:
|
||||
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
|
||||
else:
|
||||
save_array = lambda arr, file_: xp.save(file_, arr)
|
||||
serializers = OrderedDict((
|
||||
('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)),
|
||||
('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)),
|
||||
('vectors', lambda p: save_array(self.data, p.open('wb'))),
|
||||
('keys', lambda p: xp.save(p.open('wb'), self.keys))
|
||||
))
|
||||
return util.to_disk(path, serializers, exclude)
|
||||
|
||||
|
@ -133,8 +168,9 @@ cdef class Vectors:
|
|||
self.key2row[key] = i
|
||||
|
||||
def load_vectors(path):
|
||||
xp = Model.ops.xp
|
||||
if path.exists():
|
||||
self.data = numpy.load(path)
|
||||
self.data = xp.load(path)
|
||||
|
||||
serializers = OrderedDict((
|
||||
('keys', load_keys),
|
||||
|
|
|
@ -27,6 +27,7 @@ from .vectors import Vectors
|
|||
from . import util
|
||||
from . import attrs
|
||||
from . import symbols
|
||||
from ._ml import link_vectors_to_models
|
||||
|
||||
|
||||
cdef class Vocab:
|
||||
|
@ -65,7 +66,7 @@ cdef class Vocab:
|
|||
self.strings.add(name)
|
||||
self.lex_attr_getters = lex_attr_getters
|
||||
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
||||
self.vectors = Vectors(self.strings, 300)
|
||||
self.vectors = Vectors(self.strings)
|
||||
|
||||
property lang:
|
||||
def __get__(self):
|
||||
|
@ -261,7 +262,7 @@ cdef class Vocab:
|
|||
Words can be looked up by string or int ID.
|
||||
|
||||
RETURNS:
|
||||
A word vector. Size and shape determed by the
|
||||
A word vector. Size and shape determined by the
|
||||
vocab.vectors instance. Usually, a numpy ndarray
|
||||
of shape (300,) and dtype float32.
|
||||
|
||||
|
@ -323,6 +324,7 @@ cdef class Vocab:
|
|||
self.lexemes_from_bytes(file_.read())
|
||||
if self.vectors is not None:
|
||||
self.vectors.from_disk(path, exclude='strings.json')
|
||||
link_vectors_to_models(self)
|
||||
return self
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
|
@ -436,6 +438,7 @@ def unpickle_vocab(sstore, morphology, data_dir,
|
|||
vocab.lex_attr_getters = lex_attr_getters
|
||||
vocab.lexemes_from_bytes(lexemes_data)
|
||||
vocab.length = length
|
||||
link_vectors_to_models(vocab)
|
||||
return vocab
|
||||
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ fi
|
|||
|
||||
if [ "${VIA}" == "compile" ]; then
|
||||
pip install -r requirements.txt
|
||||
python setup.py build_ext --inplace
|
||||
pip install -e .
|
||||
fi
|
||||
|
||||
|
|
|
@ -8,4 +8,5 @@ include _includes/_mixins
|
|||
| does not exist!
|
||||
|
||||
h2.c-landing__title.u-heading-3.u-padding-small
|
||||
a(href="javascript:history.go(-1)") Click here to go back.
|
||||
+button(false, true, "secondary-light")(href="javascript:history.go(-1)")
|
||||
| Click here to go back
|
||||
|
|
|
@ -3,24 +3,22 @@
|
|||
"landing": true,
|
||||
"logos": [
|
||||
{
|
||||
"quora": [ "https://www.quora.com", 150 ],
|
||||
"chartbeat": [ "https://chartbeat.com", 200 ],
|
||||
"duedil": [ "https://www.duedil.com", 150 ],
|
||||
"stitchfix": [ "https://www.stitchfix.com", 190 ]
|
||||
"airbnb": [ "https://www.airbnb.com", 150, 45],
|
||||
"quora": [ "https://www.quora.com", 120, 34 ],
|
||||
"retriever": [ "https://www.retriever.no", 150, 33 ],
|
||||
"stitchfix": [ "https://www.stitchfix.com", 150, 18 ]
|
||||
},
|
||||
{
|
||||
"wayblazer": [ "http://wayblazer.com", 200 ],
|
||||
"indico": [ "https://indico.io", 150 ],
|
||||
"chattermill": [ "https://chattermill.io", 175 ],
|
||||
"turi": [ "https://turi.com", 150 ],
|
||||
"kip": [ "http://kipthis.com", 70 ]
|
||||
},
|
||||
"chartbeat": [ "https://chartbeat.com", 180, 25 ],
|
||||
"allenai": [ "https://allenai.org", 220, 37 ]
|
||||
}
|
||||
],
|
||||
"features": [
|
||||
{
|
||||
"socrata": [ "https://www.socrata.com", 150 ],
|
||||
"cytora": [ "http://www.cytora.com", 125 ],
|
||||
"signaln": [ "http://signaln.com", 150 ],
|
||||
"wonderflow": [ "http://www.wonderflow.co", 200 ],
|
||||
"synapsify": [ "http://www.gosynapsify.com", 150 ]
|
||||
"thoughtworks": ["https://www.thoughtworks.com/radar/tools", 150, 28],
|
||||
"wapo": ["https://www.washingtonpost.com/news/wonk/wp/2016/05/18/googles-new-artificial-intelligence-cant-understand-these-sentences-can-you/", 100, 77],
|
||||
"venturebeat": ["https://venturebeat.com/2017/01/27/4-ai-startups-that-analyze-customer-reviews/", 150, 19],
|
||||
"microsoft": ["https://www.microsoft.com/developerblog/2016/09/13/training-a-classifier-for-relation-extraction-from-medical-literature/", 130, 28]
|
||||
}
|
||||
]
|
||||
},
|
||||
|
@ -34,7 +32,24 @@
|
|||
"landing": true
|
||||
},
|
||||
|
||||
"announcement" : {
|
||||
"title": "Important Announcement"
|
||||
"styleguide": {
|
||||
"title": "Styleguide",
|
||||
"sidebar": {
|
||||
"Styleguide": { "": "styleguide" },
|
||||
"Resources": {
|
||||
"Website Source": "https://github.com/explosion/spacy/tree/master/website",
|
||||
"Contributing Guide": "https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md"
|
||||
}
|
||||
},
|
||||
"menu": {
|
||||
"Introduction": "intro",
|
||||
"Logo": "logo",
|
||||
"Colors": "colors",
|
||||
"Typography": "typography",
|
||||
"Elements": "elements",
|
||||
"Components": "components",
|
||||
"Embeds": "embeds",
|
||||
"Markup Reference": "markup"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,12 +11,9 @@
|
|||
"COMPANY": "Explosion AI",
|
||||
"COMPANY_URL": "https://explosion.ai",
|
||||
"DEMOS_URL": "https://demos.explosion.ai",
|
||||
"MODELS_REPO": "explosion/spacy-models",
|
||||
|
||||
"SPACY_VERSION": "1.8",
|
||||
"LATEST_NEWS": {
|
||||
"url": "https://github.com/explosion/spaCy/releases/tag/v2.0.0-alpha",
|
||||
"title": "Test spaCy v2.0.0 alpha!"
|
||||
},
|
||||
"SPACY_VERSION": "2.0",
|
||||
|
||||
"SOCIAL": {
|
||||
"twitter": "spacy_io",
|
||||
|
@ -27,25 +24,23 @@
|
|||
},
|
||||
|
||||
"NAVIGATION": {
|
||||
"Home": "/",
|
||||
"Usage": "/docs/usage",
|
||||
"Reference": "/docs/api",
|
||||
"Demos": "/docs/usage/showcase",
|
||||
"Blog": "https://explosion.ai/blog"
|
||||
"Usage": "/usage",
|
||||
"Models": "/models",
|
||||
"API": "/api"
|
||||
},
|
||||
|
||||
"FOOTER": {
|
||||
"spaCy": {
|
||||
"Usage": "/docs/usage",
|
||||
"API Reference": "/docs/api",
|
||||
"Tutorials": "/docs/usage/tutorials",
|
||||
"Showcase": "/docs/usage/showcase"
|
||||
"Usage": "/usage",
|
||||
"Models": "/models",
|
||||
"API Reference": "/api",
|
||||
"Resources": "/usage/resources"
|
||||
},
|
||||
"Support": {
|
||||
"Issue Tracker": "https://github.com/explosion/spaCy/issues",
|
||||
"StackOverflow": "http://stackoverflow.com/questions/tagged/spacy",
|
||||
"Reddit usergroup": "https://www.reddit.com/r/spacynlp/",
|
||||
"Gitter chat": "https://gitter.im/explosion/spaCy"
|
||||
"Reddit Usergroup": "https://www.reddit.com/r/spacynlp/",
|
||||
"Gitter Chat": "https://gitter.im/explosion/spaCy"
|
||||
},
|
||||
"Connect": {
|
||||
"Twitter": "https://twitter.com/spacy_io",
|
||||
|
@ -74,21 +69,11 @@
|
|||
{"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" },
|
||||
{"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVIDIA graphics card with CUDA 2+. See section below for more info."}]
|
||||
},
|
||||
{ "id": "model", "title": "Models", "multiple": true, "options": [
|
||||
{ "id": "en", "title": "English", "meta": "50MB" },
|
||||
{ "id": "de", "title": "German", "meta": "645MB" },
|
||||
{ "id": "fr", "title": "French", "meta": "1.33GB" },
|
||||
{ "id": "es", "title": "Spanish", "meta": "377MB"}]
|
||||
}
|
||||
{ "id": "model", "title": "Models", "multiple": true }
|
||||
],
|
||||
|
||||
"QUICKSTART_MODELS": [
|
||||
{ "id": "lang", "title": "Language", "options": [
|
||||
{ "id": "en", "title": "English", "checked": true },
|
||||
{ "id": "de", "title": "German" },
|
||||
{ "id": "fr", "title": "French" },
|
||||
{ "id": "es", "title": "Spanish" }]
|
||||
},
|
||||
{ "id": "lang", "title": "Language"},
|
||||
{ "id": "load", "title": "Loading style", "options": [
|
||||
{ "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." },
|
||||
{ "id": "module", "title": "Import as module", "help": "Import the model explicitly as a Python module." }]
|
||||
|
@ -98,50 +83,15 @@
|
|||
}
|
||||
],
|
||||
|
||||
"MODELS": {
|
||||
"en": [
|
||||
{ "id": "en_core_web_sm", "lang": "English", "feats": [1, 1, 1, 1], "size": "50 MB", "license": "CC BY-SA", "def": true },
|
||||
{ "id": "en_core_web_md", "lang": "English", "feats": [1, 1, 1, 1], "size": "1 GB", "license": "CC BY-SA" },
|
||||
{ "id": "en_depent_web_md", "lang": "English", "feats": [1, 1, 1, 0], "size": "328 MB", "license": "CC BY-SA" },
|
||||
{ "id": "en_vectors_glove_md", "lang": "English", "feats": [1, 0, 0, 1], "size": "727 MB", "license": "CC BY-SA" }
|
||||
],
|
||||
"de": [
|
||||
{ "id": "de_core_news_md", "lang": "German", "feats": [1, 1, 1, 1], "size": "645 MB", "license": "CC BY-SA" }
|
||||
],
|
||||
"fr": [
|
||||
{ "id": "fr_depvec_web_lg", "lang": "French", "feats": [1, 1, 0, 1], "size": "1.33 GB", "license": "CC BY-NC" }
|
||||
],
|
||||
"es": [
|
||||
{ "id": "es_core_web_md", "lang": "Spanish", "feats": [1, 1, 1, 1], "size": "377 MB", "license": "CC BY-SA"}
|
||||
]
|
||||
},
|
||||
|
||||
"EXAMPLE_SENTENCES": {
|
||||
"en": "This is a sentence.",
|
||||
"de": "Dies ist ein Satz.",
|
||||
"fr": "C'est une phrase.",
|
||||
"es": "Esto es una frase."
|
||||
},
|
||||
|
||||
"ALPHA": true,
|
||||
"V_CSS": "1.6",
|
||||
"V_JS": "1.2",
|
||||
"V_CSS": "2.0",
|
||||
"V_JS": "2.0",
|
||||
"DEFAULT_SYNTAX": "python",
|
||||
"ANALYTICS": "UA-58931649-1",
|
||||
"MAILCHIMP": {
|
||||
"user": "spacy.us12",
|
||||
"id": "83b0498b1e7fa3c91ce68c3f1",
|
||||
"list": "89ad33e698"
|
||||
},
|
||||
"BADGES": {
|
||||
"pipy": {
|
||||
"badge": "https://img.shields.io/pypi/v/spacy.svg?style=flat-square",
|
||||
"link": "https://pypi.python.org/pypi/spacy"
|
||||
},
|
||||
"conda": {
|
||||
"badge": "https://anaconda.org/conda-forge/spacy/badges/version.svg",
|
||||
"link": "https://anaconda.org/conda-forge/spacy"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
//- 💫 INCLUDES > FOOTER
|
||||
|
||||
include _mixins
|
||||
|
||||
footer.o-footer.u-text.u-border-dotted
|
||||
footer.o-footer.u-text
|
||||
+grid.o-content
|
||||
each group, label in FOOTER
|
||||
+grid-col("quarter")
|
||||
|
@ -13,18 +11,18 @@ footer.o-footer.u-text.u-border-dotted
|
|||
li
|
||||
+a(url)=item
|
||||
|
||||
if SECTION != "docs"
|
||||
if SECTION == "index"
|
||||
+grid-col("quarter")
|
||||
include _newsletter
|
||||
|
||||
if SECTION == "docs"
|
||||
if SECTION != "index"
|
||||
.o-content.o-block.u-border-dotted
|
||||
include _newsletter
|
||||
|
||||
.o-inline-list.u-text-center.u-text-tiny.u-color-subtle
|
||||
span © 2016-#{new Date().getFullYear()} #[+a(COMPANY_URL, true)=COMPANY]
|
||||
|
||||
+a(COMPANY_URL, true)
|
||||
+svg("graphics", "explosion", 45).o-icon.u-color-theme.u-grayscale
|
||||
+a(COMPANY_URL, true)(aria-label="Explosion AI")
|
||||
+icon("explosion", 45).o-icon.u-color-theme.u-grayscale
|
||||
|
||||
+a(COMPANY_URL + "/legal", true) Legal / Imprint
|
||||
|
|
|
@ -1,35 +1,71 @@
|
|||
//- 💫 INCLUDES > FUNCTIONS
|
||||
|
||||
//- More descriptive variables for current.path and current.source
|
||||
//- Descriptive variables, available in the global scope
|
||||
|
||||
- CURRENT = current.source
|
||||
- SECTION = current.path[0]
|
||||
- SUBSECTION = current.path[1]
|
||||
- LANGUAGES = public.models._data.LANGUAGES
|
||||
- MODELS = public.models._data.MODELS
|
||||
- CURRENT_MODELS = MODELS[current.source] || []
|
||||
|
||||
- MODEL_COUNT = Object.keys(MODELS).map(m => Object.keys(MODELS[m]).length).reduce((a, b) => a + b)
|
||||
- MODEL_LANG_COUNT = Object.keys(MODELS).length
|
||||
- LANG_COUNT = Object.keys(LANGUAGES).length
|
||||
|
||||
- MODEL_META = public.models._data.MODEL_META
|
||||
- MODEL_LICENSES = public.models._data.MODEL_LICENSES
|
||||
- MODEL_ACCURACY = public.models._data.MODEL_ACCURACY
|
||||
- EXAMPLE_SENTENCES = public.models._data.EXAMPLE_SENTENCES
|
||||
|
||||
- IS_PAGE = (SECTION != "index") && !landing
|
||||
- IS_MODELS = (SECTION == "models" && LANGUAGES[current.source])
|
||||
- HAS_MODELS = IS_MODELS && CURRENT_MODELS.length
|
||||
|
||||
|
||||
//- Add prefixes to items of an array (for modifier CSS classes)
|
||||
array - [array] list of class names or options, e.g. ["foot"]
|
||||
prefix - [string] prefix to add to each class, e.g. "c-table__row"
|
||||
RETURNS - [array] list of modified class names
|
||||
|
||||
- function prefixArgs(array, prefix) {
|
||||
- return array.map(function(arg) {
|
||||
- return prefix + '--' + arg;
|
||||
- }).join(' ');
|
||||
- return array.map(arg => prefix + '--' + arg).join(' ');
|
||||
- }
|
||||
|
||||
|
||||
//- Convert API paths (semi-temporary fix for renamed sections)
    Paths of the form "spacy#x", "displacy#x" and "util#x" are rewritten to
    "top-level#<module>.x"; paths of the form "cli#x" are rewritten to
    "top-level#x". Every other path is returned unchanged.
    path    - [string] link path supplied to +api mixin
    RETURNS - [string] new link path to correct location

- function convertAPIPath(path) {
-     var parts = path.split('#');
-     var hasAnchor = parts.length > 1;
-     var page = parts[0];
-     var anchor = parts[1];
-     // Only the first anchor component is used; anything after a second "#" is dropped.
-     if (hasAnchor && ['spacy', 'displacy', 'util'].indexOf(page) !== -1) {
-         return "top-level#" + page + '.' + anchor;
-     }
-     if (hasAnchor && page === 'cli') {
-         return "top-level#" + anchor;
-     }
-     return path;
- }
|
||||
|
||||
|
||||
//- Get model components from ID. Components can then be looked up in LANGUAGES
    and MODEL_META respectively, to get their human-readable form.
    The ID is split on underscores and the first four pieces are assigned, in
    order, to lang, type, genre and size. Missing pieces are left undefined.
    id      - [string] model ID, e.g. "en_core_web_sm"
    RETURNS - [object] object keyed by components lang, type, genre and size

- function getModelComponents(id) {
-     var names = ['lang', 'type', 'genre', 'size'];
-     var pieces = id.split('_');
-     var components = {};
-     names.forEach(function(name, i) { components[name] = pieces[i]; });
-     return components;
- }
|
||||
|
||||
|
||||
//- Generate GitHub links
|
||||
repo - [string] name of repo owned by explosion
|
||||
filepath - [string] logical path to file relative to repository root
|
||||
branch - [string] optional branch, defaults to "master"
|
||||
RETURNS - [string] the correct link to the file on GitHub
|
||||
|
||||
- function gh(repo, filepath, branch) {
|
||||
- var branch = ALPHA ? 'develop' : branch
|
||||
- return 'https://github.com/' + SOCIAL.github + '/' + repo + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' );
|
||||
- }
|
||||
|
||||
|
||||
//- Get social images
|
||||
|
||||
- function getSocialImg() {
|
||||
- var base = SITE_URL + '/assets/img/social/preview_'
|
||||
- var image = ALPHA ? 'alpha' : 'default'
|
||||
- if (preview) image = preview
|
||||
- else if (SECTION == 'docs' && !ALPHA) image = 'docs'
|
||||
- return base + image + '.jpg'
|
||||
- return 'https://github.com/' + SOCIAL.github + '/' + (repo || '') + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' );
|
||||
- }
|
||||
|
|
|
@ -1,5 +1,13 @@
|
|||
//- 💫 MIXINS > BASE
|
||||
|
||||
//- Section
    Wraps the block content in a section element. The given ID is exposed
    twice: prefixed as the element's id ("section-" + id) and unprefixed as
    the data-section attribute.
    id - [string] anchor assigned to section (used for breadcrumb navigation)

mixin section(id)
    section.o-section(id="section-" + id data-section=id)
        block
|
||||
|
||||
|
||||
//- Aside wrapper
|
||||
label - [string] aside label
|
||||
|
||||
|
@ -11,34 +19,26 @@ mixin aside-wrapper(label)
|
|||
|
||||
block
|
||||
|
||||
//- Date
|
||||
input - [string] date in the format YYYY-MM-DD
|
||||
|
||||
mixin date(input)
|
||||
- var date = new Date(input)
|
||||
- var months = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December' ]
|
||||
|
||||
time(datetime=JSON.parse(JSON.stringify(date)))&attributes(attributes)=months[date.getMonth()] + ' ' + date.getDate() + ', ' + date.getFullYear()
|
||||
|
||||
|
||||
//- SVG from map
|
||||
file - [string] SVG file name in /assets/img/
|
||||
//- SVG from map (uses embedded SVG sprite)
|
||||
name - [string] SVG symbol id
|
||||
width - [integer] width in px
|
||||
height - [integer] height in px (default: same as width)
|
||||
|
||||
mixin svg(file, name, width, height)
|
||||
mixin svg(name, width, height)
|
||||
svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
|
||||
use(xlink:href="/assets/img/#{file}.svg##{name}")
|
||||
use(xlink:href="#svg_#{name}")
|
||||
|
||||
|
||||
//- Icon
|
||||
name - [string] icon name, should be SVG symbol ID
|
||||
size - [integer] icon width and height (default: 20)
|
||||
name - [string] icon name (will be used as symbol id: #svg_{name})
|
||||
width - [integer] icon width (default: 20)
|
||||
height - [integer] icon height (defaults to width)
|
||||
|
||||
mixin icon(name, size)
|
||||
- var size = size || 20
|
||||
+svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes)
|
||||
mixin icon(name, width, height)
|
||||
- var width = width || 20
|
||||
- var height = height || width
|
||||
+svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes)
|
||||
|
||||
|
||||
//- Pro/Con/Neutral icon
|
||||
|
@ -46,8 +46,8 @@ mixin icon(name, size)
|
|||
size - [integer] icon size (optional)
|
||||
|
||||
mixin procon(icon, size)
|
||||
- colors = { pro: "green", con: "red", neutral: "yellow" }
|
||||
+icon(icon, size)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)
|
||||
- colors = { pro: "green", con: "red", neutral: "subtle" }
|
||||
+icon("circle", size || 16)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)
|
||||
|
||||
|
||||
//- Headlines Helper Mixin
|
||||
|
@ -80,8 +80,7 @@ mixin headline(level)
|
|||
|
||||
mixin permalink(id)
|
||||
if id
|
||||
a.u-permalink(id=id href="##{id}")
|
||||
+icon("anchor").u-permalink__icon
|
||||
a.u-permalink(href="##{id}")
|
||||
block
|
||||
|
||||
else
|
||||
|
@ -109,7 +108,7 @@ mixin quickstart(groups, headline, description, hide_results)
|
|||
.c-quickstart__fields
|
||||
for option in group.options
|
||||
input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
|
||||
label.c-quickstart__label(for="qs-#{option.id}")!=option.title
|
||||
label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title
|
||||
if option.meta
|
||||
| #[span.c-quickstart__label__meta (#{option.meta})]
|
||||
if option.help
|
||||
|
@ -122,12 +121,10 @@ mixin quickstart(groups, headline, description, hide_results)
|
|||
code.c-code-block__content.c-quickstart__code(data-qs-results="")
|
||||
block
|
||||
|
||||
.c-quickstart__info.u-text-tiny.o-block.u-text-right
|
||||
| Like this widget? Check out #[+a("https://github.com/ines/quickstart").u-link quickstart.js]!
|
||||
|
||||
|
||||
//- Quickstart code item
|
||||
data [object] - Rendering conditions (keyed by option group ID, value: option)
|
||||
data - [object] Rendering conditions (keyed by option group ID, value: option)
|
||||
style - [string] modifier ID for line style
|
||||
|
||||
mixin qs(data, style)
|
||||
- args = {}
|
||||
|
@ -148,6 +145,13 @@ mixin terminal(label)
|
|||
+code.x-terminal__code
|
||||
block
|
||||
|
||||
//- Chart.js
    Renders an 800x400 canvas inside a figure. Extra attributes passed to the
    mixin are applied to the figure, not the canvas.
    id - [string] chart ID, will be assigned as #chart_{id}

mixin chart(id)
    figure.o-block&attributes(attributes)
        canvas(id="chart_#{id}" width="800" height="400" style="max-width: 100%")
|
||||
|
||||
|
||||
//- Gitter chat button and widget
|
||||
button - [string] text shown on button
|
||||
|
@ -156,26 +160,24 @@ mixin terminal(label)
|
|||
mixin gitter(button, label)
|
||||
aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
|
||||
|
||||
button.js-gitter-button.c-chat__button.u-text-small
|
||||
+icon("chat").o-icon--inline
|
||||
button.js-gitter-button.c-chat__button.u-text-tag
|
||||
+icon("chat", 16).o-icon--inline
|
||||
!=button
|
||||
|
||||
|
||||
//- Badge
|
||||
name - [string] "pipy" or "conda"
|
||||
image - [string] path to badge image
|
||||
url - [string] badge link
|
||||
|
||||
mixin badge(name)
|
||||
- site = BADGES[name]
|
||||
|
||||
if site
|
||||
+a(site.link).u-padding-small
|
||||
img(src=site.badge alt="{name} version" height="20")
|
||||
mixin badge(image, url)
|
||||
+a(url).u-padding-small.u-hide-link&attributes(attributes)
|
||||
img.o-badge(src=image alt=url height="20")
|
||||
|
||||
|
||||
//- Logo
|
||||
//- spaCy logo
|
||||
|
||||
mixin logo()
|
||||
+svg("graphics", "spacy", 675, 215).o-logo&attributes(attributes)
|
||||
+svg("spacy", 675, 215).o-logo&attributes(attributes)
|
||||
|
||||
|
||||
//- Landing
|
||||
|
@ -186,18 +188,56 @@ mixin landing-header()
|
|||
.c-landing__content
|
||||
block
|
||||
|
||||
mixin landing-banner(headline, label)
|
||||
.c-landing__banner.u-padding.o-block.u-color-light
|
||||
+grid.c-landing__banner__content.o-no-block
|
||||
+grid-col("third")
|
||||
h3.u-heading.u-heading-1
|
||||
if label
|
||||
div
|
||||
span.u-text-label.u-text-label--light=label
|
||||
!=headline
|
||||
|
||||
mixin landing-badge(url, graphic, alt, size)
|
||||
+a(url)(aria-label=alt title=alt).c-landing__badge
|
||||
+svg("graphics", graphic, size || 225)
|
||||
+grid-col("two-thirds").c-landing__banner__text
|
||||
block
|
||||
|
||||
|
||||
//- Logo rows on the landing page
    title - [string] headline displayed above the logos
    logos - [array] list of rows; each row is an object keyed by icon name,
            with values of the form [url, width, height]
    The block content is rendered once, together with the last row.
    NOTE(review): nesting was reconstructed from a flattened listing –
    confirm that the "if is_last" branch belongs inside the grid.

mixin landing-logos(title, logos)
    .o-content.u-text-center&attributes(attributes)
        h3.u-heading.u-text-label.u-color-dark=title

        each row, i in logos
            - var is_last = i == logos.length - 1
            +grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null)
                each details, name in row
                    +a(details[0]).u-padding-medium
                        +icon(name, details[1], details[2])

                if is_last
                    block
|
||||
|
||||
|
||||
//- Under construction (temporary)
|
||||
Marks sections that still need to be completed for the v2.0 release.
|
||||
|
||||
mixin under-construction()
|
||||
+infobox("🚧 Under construction")
|
||||
+infobox("Under construction", "🚧")
|
||||
| This section is still being written and will be updated for the v2.0
|
||||
| release. Is there anything that you think should definitely be mentioned or
|
||||
| explained here? Any examples you'd like to see? #[strong Let us know]
|
||||
| on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub!
|
||||
|
||||
|
||||
//- Alpha infobox (temporary)
    Added in the templates to notify user that they're visiting the alpha site.
    Takes no arguments; links to the v2.0.0-alpha release notes on GitHub and
    to the v1.x docs.

mixin alpha-info()
    +infobox("You are viewing the spaCy v2.0.0 alpha docs", "⚠️")
        strong This page is part of the alpha documentation for spaCy v2.0.
        | It does not reflect the state of the latest stable release.
        | Because v2.0 is still under development, the implementation
        | may differ from the intended state described here. See the
        | #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
        | for details on how to install and test the new version. To
        | read the official docs for spaCy v1.x,
        | #[+a("https://spacy.io/docs") go here].
|
||||
|
|
|
@ -8,11 +8,15 @@ include _mixins-base
|
|||
level - [integer] headline level, corresponds to h1, h2, h3 etc.
|
||||
id - [string] unique identifier, creates permalink (optional)
|
||||
|
||||
mixin h(level, id)
|
||||
+headline(level).u-heading&attributes(attributes)
|
||||
mixin h(level, id, source)
|
||||
+headline(level).u-heading(id=id)&attributes(attributes)
|
||||
+permalink(id)
|
||||
block
|
||||
|
||||
if source
|
||||
+button(gh("spacy", source), false, "secondary", "small").u-nowrap.u-float-right
|
||||
span Source #[+icon("code", 14).o-icon--inline]
|
||||
|
||||
|
||||
//- External links
|
||||
url - [string] link href
|
||||
|
@ -38,21 +42,23 @@ mixin src(url)
|
|||
|
||||
|
||||
//- API link (with added tag and automatically generated path)
|
||||
path - [string] path to API docs page relative to /docs/api/
|
||||
path - [string] path to API docs page relative to /api/
|
||||
|
||||
mixin api(path)
|
||||
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap
|
||||
- path = convertAPIPath(path)
|
||||
+a("/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap
|
||||
block
|
||||
|
||||
| #[+icon("book", 18).o-icon--inline.u-color-theme]
|
||||
| #[+icon("book", 16).o-icon--inline.u-color-theme]
|
||||
|
||||
|
||||
//- Help icon with tooltip
|
||||
tooltip - [string] Tooltip text
|
||||
icon_size - [integer] Optional size of help icon in px.
|
||||
|
||||
mixin help(tooltip)
|
||||
mixin help(tooltip, icon_size)
|
||||
span(data-tooltip=tooltip)&attributes(attributes)
|
||||
+icon("help", 16).i-icon--inline
|
||||
+icon("help", icon_size || 16).o-icon--inline
|
||||
|
||||
|
||||
//- Aside for text
|
||||
|
@ -68,24 +74,43 @@ mixin aside(label)
|
|||
label - [string] aside title (optional or false for no label)
|
||||
language - [string] language for syntax highlighting (default: "python")
|
||||
supports basic relevant languages available for PrismJS
|
||||
prompt - [string] prompt displayed before first line, e.g. "$"
|
||||
|
||||
mixin aside-code(label, language)
|
||||
mixin aside-code(label, language, prompt)
|
||||
+aside-wrapper(label)
|
||||
+code(false, language).o-no-block
|
||||
+code(false, language, prompt).o-no-block
|
||||
block
|
||||
|
||||
|
||||
//- Infobox
|
||||
label - [string] infobox title (optional or false for no title)
|
||||
emoji - [string] optional emoji displayed before the title, necessary as
|
||||
argument to be able to wrap it for spacing
|
||||
|
||||
mixin infobox(label)
|
||||
mixin infobox(label, emoji)
|
||||
aside.o-box.o-block.u-text-small
|
||||
if label
|
||||
h3.u-text-label.u-color-theme=label
|
||||
h3.u-heading.u-text-label.u-color-theme
|
||||
if emoji
|
||||
span.o-emoji=emoji
|
||||
| #{label}
|
||||
|
||||
block
|
||||
|
||||
|
||||
//- Logos displayed in the top corner of some infoboxes
    Each argument is one logo, given as [icon ID, width, height, link].
    A logo with a link is wrapped in an anchor; without one, only the icon
    is rendered.
    logos - [array] List of icon ID, width, height and link.

mixin infobox-logos(...logos)
    .o-box__logos.u-text-right.u-float-right
        each logo in logos
            - var logo_url = logo[3]
            if logo_url
                | #[+a(logo_url).u-inline-block.u-hide-link.u-padding-small #[+icon(logo[0], logo[1], logo[2]).u-color-dark]]
            else
                | #[+icon(logo[0], logo[1], logo[2]).u-color-dark]
|
||||
|
||||
|
||||
|
||||
//- Link button
|
||||
url - [string] link href
|
||||
trusted - [boolean] if not set / false, rel="noopener nofollow" is added
|
||||
|
@ -94,7 +119,7 @@ mixin infobox(label)
|
|||
see assets/css/_components/_buttons.sass
|
||||
|
||||
mixin button(url, trusted, ...style)
|
||||
- external = url.includes("http")
|
||||
- external = url && url.includes("http")
|
||||
a.c-button.u-text-label(href=url class=prefixArgs(style, "c-button") role="button" target=external ? "_blank" : null rel=external && !trusted ? "noopener nofollow" : null)&attributes(attributes)
|
||||
block
|
||||
|
||||
|
@ -103,31 +128,33 @@ mixin button(url, trusted, ...style)
|
|||
label - [string] aside title (optional or false for no label)
|
||||
language - [string] language for syntax highlighting (default: "python")
|
||||
supports basic relevant languages available for PrismJS
|
||||
prompt - [string] prompt or icon to display next to code block, (mostly used for old/new)
|
||||
prompt - [string] prompt displayed before first line, e.g. "$"
|
||||
height - [integer] optional height to clip code block to
|
||||
icon - [string] icon displayed next to code block (e.g. "accept" for new code)
|
||||
wrap - [boolean] wrap text and disable horizontal scrolling
|
||||
|
||||
mixin code(label, language, prompt, height)
|
||||
mixin code(label, language, prompt, height, icon, wrap)
|
||||
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
|
||||
if label
|
||||
h4.u-text-label.u-text-label--dark=label
|
||||
- var icon = (prompt == 'accept' || prompt == 'reject')
|
||||
- var icon = icon || (prompt == 'accept' || prompt == 'reject')
|
||||
if icon
|
||||
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
|
||||
.c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
|
||||
+icon(icon, 18)
|
||||
|
||||
code.c-code-block__content(data-prompt=icon ? null : prompt)
|
||||
code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=icon ? null : prompt)
|
||||
block
|
||||
|
||||
|
||||
//- Code blocks to display old/new versions
|
||||
|
||||
mixin code-old()
|
||||
+code(false, false, "reject").o-block-small
|
||||
+code(false, false, false, false, "reject").o-block-small
|
||||
block
|
||||
|
||||
mixin code-new()
|
||||
+code(false, false, "accept").o-block-small
|
||||
+code(false, false, false, false, "accept").o-block-small
|
||||
block
|
||||
|
||||
|
||||
|
@ -138,12 +165,33 @@ mixin code-new()
|
|||
|
||||
mixin codepen(slug, height, default_tab)
|
||||
figure.o-block(style="min-height: #{height}px")&attributes(attributes)
|
||||
.codepen(data-height=height data-theme-id="26467" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen)
|
||||
.codepen(data-height=height data-theme-id="31335" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen)
|
||||
+a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen
|
||||
|
||||
script(async src="https://assets.codepen.io/assets/embed/ei.js")
|
||||
|
||||
|
||||
//- GitHub embed
    Renders a fixed-height code preview placeholder, plus a footer showing
    the file path and a "View on GitHub" button. The placeholder only carries
    a data-gh-embed attribute ("repo/branch/file"); presumably a client-side
    script fetches the file contents from it – confirm against the site JS.
    repo     - [string] repository owned by explosion organization
    file     - [string] logical path to file, relative to repository root
    alt_file - [string] alternative file path used in footer and link button
    height   - [integer] height of code preview in px (default: 250)
    language - [string] language for syntax highlighting
               (optional, default: DEFAULT_SYNTAX)

mixin github(repo, file, alt_file, height, language)
    //- The embedded file is taken from "develop" while the site is in alpha.
    - var branch = ALPHA ? "develop" : "master"
    - var height = height || 250

    figure.o-block
        //- "language" is now an explicit optional argument; previously the
            class was built from an undeclared variable of the same name.
        pre.c-code-block.o-block-small(class="lang-#{(language || DEFAULT_SYNTAX)}" style="height: #{height}px; min-height: #{height}px")
            code.c-code-block__content(data-gh-embed="#{repo}/#{branch}/#{file}")

        footer.o-grid.u-text
            .o-block-small.u-flex-full #[+icon("github")] #[code=repo + '/' + (alt_file || file)]
            div
                +button(gh(repo, alt_file || file), false, "primary", "small") View on GitHub
|
||||
|
||||
|
||||
|
||||
//- Images / figures
|
||||
url - [string] url or path to image
|
||||
width - [integer] image width in px, for better rendering (default: 500)
|
||||
|
@ -168,10 +216,26 @@ mixin image-caption()
|
|||
block
|
||||
|
||||
|
||||
//- Label
|
||||
//- Graphic or illustration with button
    Wraps the block content in +image. If a path to the original is given,
    a right-aligned "View large graphic" button linking to it is added.
    original - [string] Path to original image (optional)
    NOTE(review): nesting was reconstructed from a flattened listing –
    confirm that the button belongs inside +image.

mixin graphic(original)
    +image
        block
        if original
            .u-text-right
                +button(original, false, "secondary", "small") View large graphic
|
||||
|
||||
|
||||
//- Labels
|
||||
|
||||
mixin label()
|
||||
.u-text-label.u-color-subtle&attributes(attributes)
|
||||
.u-text-label.u-color-dark&attributes(attributes)
|
||||
block
|
||||
|
||||
|
||||
//- Inline label
    Renders the block content as a strong element with the label text style,
    for use within running text. Extra attributes are passed through.

mixin label-inline()
    strong.u-text-label.u-color-dark&attributes(attributes)
        block
|
||||
|
||||
|
||||
|
@ -188,7 +252,9 @@ mixin tag()
|
|||
mixin tag-model(...capabs)
|
||||
- var intro = "To use this functionality, spaCy needs a model to be installed"
|
||||
- var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : ""
|
||||
+tag Requires model
|
||||
|
||||
span.u-nowrap
|
||||
+tag Needs model
|
||||
+help(intro + ext + ".").u-color-theme
|
||||
|
||||
|
||||
|
@ -219,13 +285,7 @@ mixin list(type, start)
|
|||
|
||||
//- List item (only used within +list)
|
||||
|
||||
mixin item(procon)
|
||||
if procon
|
||||
li&attributes(attributes)
|
||||
+procon(procon).c-list__icon
|
||||
block
|
||||
|
||||
else
|
||||
mixin item()
|
||||
li.c-list__item&attributes(attributes)
|
||||
block
|
||||
|
||||
|
@ -237,9 +297,9 @@ mixin table(head)
|
|||
table.c-table.o-block&attributes(attributes)
|
||||
|
||||
if head
|
||||
+row
|
||||
+row("head")
|
||||
each column in head
|
||||
th.c-table__head-cell.u-text-label=column
|
||||
+head-cell=column
|
||||
|
||||
block
|
||||
|
||||
|
@ -251,10 +311,11 @@ mixin row(...style)
|
|||
block
|
||||
|
||||
|
||||
//- Footer table row (only used within +table)
|
||||
|
||||
mixin footrow()
|
||||
tr.c-table__row.c-table__row--foot&attributes(attributes)
|
||||
//- Header table cell (only used within +row)
|
||||
|
||||
mixin head-cell()
|
||||
th.c-table__head-cell.u-text-label&attributes(attributes)
|
||||
block
|
||||
|
||||
|
||||
|
@ -285,70 +346,57 @@ mixin grid-col(width)
|
|||
|
||||
//- Card (only used within +grid)
|
||||
title - [string] card title
|
||||
details - [object] url, image, author, description, tags etc.
|
||||
(see /docs/usage/_data.json)
|
||||
|
||||
mixin card(title, details)
|
||||
+grid-col("half").o-card.u-text&attributes(attributes)
|
||||
if details.image
|
||||
+a(details.url).o-block-small
|
||||
img(src=details.image alt=title width="300" role="presentation")
|
||||
url - [string] link for card
|
||||
author - [string] optional author, displayed as byline at the bottom
|
||||
icon - [string] optional ID of icon displayed with card
|
||||
width - [string] optional width of grid column, defaults to "half"
|
||||
|
||||
mixin card(title, url, author, icon, width)
|
||||
+grid-col(width || "half").o-box.o-grid.o-grid--space.u-text&attributes(attributes)
|
||||
+a(url)
|
||||
h4.u-heading.u-text-label
|
||||
if icon
|
||||
+icon(icon, 25).u-float-right
|
||||
if title
|
||||
+a(details.url)
|
||||
+h(3)=title
|
||||
span.u-color-dark=title
|
||||
.o-block-small.u-text-small
|
||||
block
|
||||
if author
|
||||
.u-color-subtle.u-text-tiny by #{author}
|
||||
|
||||
if details.author
|
||||
.u-text-small.u-color-subtle by #{details.author}
|
||||
|
||||
if details.description || details.tags
|
||||
ul
|
||||
if details.description
|
||||
li=details.description
|
||||
|
||||
if details.tags
|
||||
li
|
||||
each tag in details.tags
|
||||
span.u-text-tag #{tag}
|
||||
|
|
||||
//- Table of contents, to be used with +item mixins for links
|
||||
col - [string] width of column (see +grid-col)
|
||||
|
||||
mixin table-of-contents(col)
|
||||
+grid-col(col || "half")
|
||||
+infobox
|
||||
+label.o-block-small Table of contents
|
||||
+list("numbers").u-text-small.o-no-block
|
||||
block
|
||||
|
||||
|
||||
//- Simpler card list item (only used within +list)
|
||||
title - [string] card title
|
||||
details - [object] url, image, author, description, tags etc.
|
||||
(see /docs/usage/_data.json)
|
||||
//- Bibliography
|
||||
id - [string] ID of bibliography component, for anchor links. Can be used if
|
||||
there's more than one bibliography on one page.
|
||||
|
||||
mixin card-item(title, details)
|
||||
+item&attributes(attributes)
|
||||
+a(details.url)=title
|
||||
|
||||
if details.description
|
||||
br
|
||||
span=details.description
|
||||
|
||||
if details.author
|
||||
br
|
||||
span.u-text-small.u-color-subtle by #{details.author}
|
||||
mixin bibliography(id)
|
||||
section(id=id || "bibliography")
|
||||
+infobox
|
||||
+label.o-block-small Bibliography
|
||||
+list("numbers").u-text-small.o-no-block
|
||||
block
|
||||
|
||||
|
||||
//- Table row for models table
|
||||
//- Footnote
|
||||
id - [string / integer] ID of footnote.
|
||||
bib_id - [string] ID of bibliography component, defaults to "bibliography".
|
||||
tooltip - [string] optional text displayed as tooltip
|
||||
|
||||
mixin model-row(name, lang, procon, size, license, default_model, divider)
|
||||
- var licenses = { "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/", "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/" }
|
||||
|
||||
+row(divider ? "divider": null)
|
||||
+cell #[code=name]
|
||||
if default_model
|
||||
| #[span.u-color-theme(title="default model") #[+icon("star", 16)]]
|
||||
+cell=lang
|
||||
each icon in procon
|
||||
+cell.u-text-center #[+procon(icon ? "pro" : "con")]
|
||||
+cell.u-text-right=size
|
||||
+cell
|
||||
if license in licenses
|
||||
+a(licenses[license])=license
|
||||
mixin fn(id, bib_id, tooltip)
|
||||
sup.u-padding-small(id="bib" + id data-tooltip=tooltip)
|
||||
span.u-text-tag
|
||||
+a("#" + (bib_id || "bibliography")).u-hide-link #{id}
|
||||
|
||||
|
||||
//- Table rows for annotation specs
|
||||
|
@ -383,14 +431,3 @@ mixin annotation-row(annots, style)
|
|||
else
|
||||
+cell=cell
|
||||
block
|
||||
|
||||
|
||||
//- Table of contents, to be used with +item mixins for links
|
||||
col - [string] width of column (see +grid-col)
|
||||
|
||||
mixin table-of-contents(col)
|
||||
+grid-col(col || "half")
|
||||
+infobox
|
||||
+label.o-block-small Table of contents
|
||||
+list("numbers").u-text-small.o-no-block
|
||||
block
|
||||
|
|
|
@ -1,19 +1,15 @@
|
|||
//- 💫 INCLUDES > TOP NAVIGATION
|
||||
|
||||
include _mixins
|
||||
|
||||
nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null)
|
||||
a(href='/') #[+logo]
|
||||
|
||||
if SUBSECTION != "index"
|
||||
.u-text-label.u-padding-small.u-hidden-xs=SUBSECTION
|
||||
a(href="/" aria-label=SITENAME) #[+logo]
|
||||
|
||||
ul.c-nav__menu
|
||||
- var NAV = ALPHA ? { "Usage": "/docs/usage", "Reference": "/docs/api" } : NAVIGATION
|
||||
|
||||
each url, item in NAV
|
||||
li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null)
|
||||
- var current_url = '/' + current.path[0]
|
||||
each url, item in NAVIGATION
|
||||
li.c-nav__menu__item(class=(current_url == url) ? "is-active" : null)
|
||||
+a(url)=item
|
||||
|
||||
li.c-nav__menu__item
|
||||
+a(gh("spaCy"))(aria-label="GitHub").u-hidden-xs #[+icon("github", 20)]
|
||||
li.c-nav__menu__item.u-hidden-xs
|
||||
+a(gh("spaCy"))(aria-label="GitHub") #[+icon("github", 20)]
|
||||
|
||||
progress.c-progress.js-progress(value="0" max="1")
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
//- 💫 INCLUDES > NEWSLETTER
|
||||
|
||||
ul.o-block
|
||||
ul.o-block-small
|
||||
li.u-text-label.u-color-subtle Stay in the loop!
|
||||
li Receive updates about new releases, tutorials and more.
|
||||
|
||||
|
@ -10,7 +10,6 @@ form.o-grid#mc-embedded-subscribe-form(action="//#{MAILCHIMP.user}.list-manage.c
|
|||
div(style="position: absolute; left: -5000px;" aria-hidden="true")
|
||||
input(type="text" name="b_#{MAILCHIMP.id}_#{MAILCHIMP.list}" tabindex="-1" value="")
|
||||
|
||||
.o-grid-col.u-border.u-padding-small
|
||||
input#mce-EMAIL.u-text(type="email" name="EMAIL" placeholder="Your email")
|
||||
|
||||
button#mc-embedded-subscribe.u-text-label.u-color-theme(type="submit" name="subscribe") Sign up
|
||||
.o-grid-col.o-grid.o-grid--nowrap.o-field.u-padding-small
|
||||
input#mce-EMAIL.o-field__input.u-text(type="email" name="EMAIL" placeholder="Your email" aria-label="Your email")
|
||||
button#mc-embedded-subscribe.o-field__button.u-text-label.u-color-theme.u-nowrap(type="submit" name="subscribe") Sign up
|
||||
|
|
|
@ -1,47 +1,56 @@
|
|||
//- 💫 INCLUDES > DOCS PAGE TEMPLATE
|
||||
|
||||
- sidebar_content = (SUBSECTION != "index") ? public.docs[SUBSECTION]._data.sidebar : public.docs._data.sidebar || FOOTER
|
||||
- sidebar_content = (public[SECTION] ? public[SECTION]._data.sidebar : public._data[SECTION] ? public._data[SECTION].sidebar : false) || FOOTER
|
||||
|
||||
include _sidebar
|
||||
|
||||
main.o-main.o-main--sidebar.o-main--aside
|
||||
article.o-content
|
||||
+grid.o-no-block
|
||||
+grid-col(source ? "two-thirds" : "full")
|
||||
+h(1)=title
|
||||
+h(1).u-heading--title=title.replace("'", "’")
|
||||
if tag
|
||||
+tag=tag
|
||||
if tag_new
|
||||
+tag-new(tag_new)
|
||||
|
||||
if teaser
|
||||
.u-heading__teaser.u-text-small.u-color-dark=teaser
|
||||
else if IS_MODELS
|
||||
.u-heading__teaser.u-text-small.u-color-dark
|
||||
| Available statistical models for
|
||||
| #[code=current.source] (#{LANGUAGES[current.source]}).
|
||||
|
||||
if source
|
||||
+grid-col("third").u-text-right
|
||||
.o-inline-list
|
||||
+button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)]
|
||||
.o-block.u-text-right
|
||||
+button(gh("spacy", source), false, "secondary", "small").u-nowrap
|
||||
| Source #[+icon("code", 14)]
|
||||
|
||||
//-if ALPHA
|
||||
//- +alpha-info
|
||||
|
||||
if ALPHA
|
||||
+infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs")
|
||||
strong This page is part of the alpha documentation for spaCy v2.0.
|
||||
| It does not reflect the state of the latest stable release.
|
||||
| Because v2.0 is still under development, the implementation
|
||||
| may differ from the intended state described here. See the
|
||||
| #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
|
||||
| for details on how to install and test the new version. To
|
||||
| read the official docs for spaCy v1.x,
|
||||
| #[+a("https://spacy.io/docs") go here].
|
||||
|
||||
if IS_MODELS
|
||||
include _page_models
|
||||
else
|
||||
!=yield
|
||||
|
||||
+grid.o-content.u-text
|
||||
+grid-col("half")
|
||||
if next && public.docs[SUBSECTION]._data[next]
|
||||
- data = public.docs[SUBSECTION]._data[next]
|
||||
|
||||
if !IS_MODELS
|
||||
.o-inline-list
|
||||
span #[strong.u-text-label Read next:] #[+a(next).u-link=data.title]
|
||||
+button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary", "small")
|
||||
| #[span.o-icon Suggest edits] #[+icon("code", 14)]
|
||||
|
||||
+grid-col("half").u-text-right
|
||||
.o-inline-list
|
||||
+button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)]
|
||||
if next && public[SECTION]._data[next]
|
||||
- data = public[SECTION]._data[next]
|
||||
|
||||
+grid("vcenter")
|
||||
+a(next).u-text-small.u-flex-full
|
||||
h4.u-text-label.u-color-dark Read next
|
||||
| #{data.title}
|
||||
|
||||
+a(next).c-icon-button.c-icon-button--right(aria-hidden="true")
|
||||
+icon("arrow-right", 24)
|
||||
|
||||
+gitter("spaCy chat")
|
||||
|
||||
|
|
77
website/_includes/_page_models.jade
Normal file
77
website/_includes/_page_models.jade
Normal file
|
@ -0,0 +1,77 @@
|
|||
//- 💫 INCLUDES > MODELS PAGE TEMPLATE
|
||||
|
||||
for id in CURRENT_MODELS
|
||||
+section(id)
|
||||
+grid("vcenter").o-no-block(id=id)
|
||||
+grid-col("two-thirds")
|
||||
+h(2)
|
||||
+a("#" + id).u-permalink=id
|
||||
|
||||
+grid-col("third").u-text-right
|
||||
.u-color-subtle.u-text-tiny
|
||||
+button(gh("spacy-models") + "/releases", true, "secondary", "small")(data-tpl=id data-tpl-key="download")
|
||||
| Release details
|
||||
.u-padding-small Latest: #[code(data-tpl=id data-tpl-key="version") n/a]
|
||||
|
||||
+aside-code("Installation", "bash", "$").
|
||||
spacy download #{id}
|
||||
|
||||
- var comps = getModelComponents(id)
|
||||
|
||||
p(data-tpl=id data-tpl-key="description")
|
||||
|
||||
div(data-tpl=id data-tpl-key="error" style="display: none")
|
||||
+infobox
|
||||
| Unable to load model details from GitHub. To find out more
|
||||
| about this model, see the overview of the
|
||||
| #[+a(gh("spacy-models") + "/releases") latest model releases].
|
||||
|
||||
+table(data-tpl=id data-tpl-key="table")
|
||||
+row
|
||||
+cell #[+label Language]
|
||||
+cell #[+tag=comps.lang] #{LANGUAGES[comps.lang]}
|
||||
for comp, label in {"Type": comps.type, "Genre": comps.genre}
|
||||
+row
|
||||
+cell #[+label=label]
|
||||
+cell #[+tag=comp] #{MODEL_META[comp]}
|
||||
+row
|
||||
+cell #[+label Size]
|
||||
+cell #[+tag=comps.size] #[span(data-tpl=id data-tpl-key="size") #[em n/a]]
|
||||
|
||||
each label in ["Pipeline", "Sources", "Author", "License"]
|
||||
- var field = label.toLowerCase()
|
||||
+row
|
||||
+cell.u-nowrap
|
||||
+label=label
|
||||
if MODEL_META[field]
|
||||
| #[+help(MODEL_META[field]).u-color-subtle]
|
||||
+cell
|
||||
span(data-tpl=id data-tpl-key=field) #[em n/a]
|
||||
|
||||
+row(data-tpl=id data-tpl-key="compat-wrapper" style="display: none")
|
||||
+cell
|
||||
+label Compat #[+help("Latest compatible model version for your spaCy installation").u-color-subtle]
|
||||
+cell
|
||||
.o-field.u-float-left
|
||||
select.o-field__select.u-text-small(data-tpl=id data-tpl-key="compat")
|
||||
.o-empty(data-tpl=id data-tpl-key="compat-versions")
|
||||
|
||||
section(data-tpl=id data-tpl-key="accuracy-wrapper" style="display: none")
|
||||
+grid.o-no-block
|
||||
+grid-col("third")
|
||||
+h(4) Accuracy
|
||||
+table.o-block-small
|
||||
for label, field in MODEL_ACCURACY
|
||||
+row(style="display: none")
|
||||
+cell.u-nowrap
|
||||
+label=label
|
||||
if MODEL_META[field]
|
||||
| #[+help(MODEL_META[field]).u-color-subtle]
|
||||
+cell.u-text-right(data-tpl=id data-tpl-key=field)
|
||||
| n/a
|
||||
|
||||
+grid-col("two-thirds")
|
||||
+h(4) Comparison
|
||||
+chart(id).u-padding-small
|
||||
|
||||
p.u-text-small.u-color-dark(data-tpl=id data-tpl-key="notes")
|
|
@ -1,27 +1,46 @@
|
|||
//- 💫 INCLUDES > SCRIPTS
|
||||
|
||||
script(src="/assets/js/main.js?v#{V_JS}")
|
||||
script(src="/assets/js/prism.js")
|
||||
|
||||
if SECTION == "docs"
|
||||
if quickstart
|
||||
script(src="/assets/js/quickstart.js")
|
||||
script var qs = new Quickstart("#qs")
|
||||
script(src="/assets/js/quickstart.min.js")
|
||||
|
||||
script.
|
||||
((window.gitter = {}).chat = {}).options = {
|
||||
useStyles: false,
|
||||
activationElement: '.js-gitter-button',
|
||||
targetElement: '.js-gitter',
|
||||
room: '!{SOCIAL.gitter}'
|
||||
};
|
||||
if IS_PAGE
|
||||
script(src="/assets/js/in-view.min.js")
|
||||
|
||||
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
|
||||
if HAS_MODELS
|
||||
script(src="/assets/js/chart.min.js")
|
||||
|
||||
if environment == "deploy"
|
||||
script(async src="https://www.google-analytics.com/analytics.js")
|
||||
|
||||
script(src="/assets/js/prism.min.js")
|
||||
script(src="/assets/js/main.js?v#{V_JS}")
|
||||
|
||||
script
|
||||
| new ProgressBar('.js-progress');
|
||||
|
||||
if changelog
|
||||
| new Changelog('!{SOCIAL.github}', 'spacy');
|
||||
|
||||
if quickstart
|
||||
| new Quickstart("#qs");
|
||||
|
||||
if IS_PAGE
|
||||
| new SectionHighlighter('data-section', 'data-nav');
|
||||
| new GitHubEmbed('!{SOCIAL.github}', 'data-gh-embed');
|
||||
| ((window.gitter = {}).chat = {}).options = {
|
||||
| useStyles: false,
|
||||
| activationElement: '.js-gitter-button',
|
||||
| targetElement: '.js-gitter',
|
||||
| room: '!{SOCIAL.gitter}'
|
||||
| };
|
||||
|
||||
if HAS_MODELS
|
||||
| new ModelLoader('!{MODELS_REPO}', !{JSON.stringify(CURRENT_MODELS)}, !{JSON.stringify(MODEL_LICENSES)}, !{JSON.stringify(MODEL_ACCURACY)});
|
||||
|
||||
if environment == "deploy"
|
||||
| window.ga=window.ga||function(){
|
||||
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
|
||||
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
|
||||
|
||||
script(async src="https://www.google-analytics.com/analytics.js")
|
||||
if IS_PAGE
|
||||
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
|
||||
|
|
|
@ -1,13 +1,23 @@
|
|||
//- 💫 INCLUDES > SIDEBAR
|
||||
|
||||
include _mixins
|
||||
|
||||
menu.c-sidebar.js-sidebar.u-text
|
||||
if sidebar_content
|
||||
each items, menu in sidebar_content
|
||||
ul.c-sidebar__section.o-block
|
||||
li.u-text-label.u-color-subtle=menu
|
||||
each items, sectiontitle in sidebar_content
|
||||
ul.c-sidebar__section.o-block-small
|
||||
li.u-text-label.u-color-dark=sectiontitle
|
||||
|
||||
each url, item in items
|
||||
li(class=(CURRENT == url || (CURRENT == "index" && url == "./")) ? "is-active" : null)
|
||||
+a(url)=item
|
||||
- var is_current = CURRENT == url || (CURRENT == "index" && url == "./")
|
||||
li.c-sidebar__item
|
||||
+a(url)(class=is_current ? "is-active" : null)=item
|
||||
|
||||
if is_current
|
||||
if IS_MODELS && CURRENT_MODELS.length
|
||||
- menu = Object.assign({}, ...CURRENT_MODELS.map(id => ({ [id]: id })))
|
||||
if menu
|
||||
ul.c-sidebar__crumb.u-hidden-sm
|
||||
- var counter = 0
|
||||
for id, title in menu
|
||||
- counter++
|
||||
li.c-sidebar__crumb__item(data-nav=id class=(counter == 1) ? "is-active" : null)
|
||||
+a("#section-" + id)=title
|
||||
|
|
157
website/_includes/_svg.jade
Normal file
157
website/_includes/_svg.jade
Normal file
File diff suppressed because one or more lines are too long
|
@ -2,11 +2,16 @@
|
|||
|
||||
include _includes/_mixins
|
||||
|
||||
- title = IS_MODELS ? LANGUAGES[current.source] || title : title
|
||||
- social_title = (SECTION == "index") ? SITENAME + " - " + SLOGAN : title + " - " + SITENAME
|
||||
- social_img = SITE_URL + "/assets/img/social/preview_" + (preview || ALPHA ? "alpha" : "default") + ".jpg"
|
||||
|
||||
doctype html
|
||||
html(lang="en")
|
||||
title
|
||||
if SECTION == "docs" && SUBSECTION && SUBSECTION != "index"
|
||||
| #{title} | #{SITENAME} #{SUBSECTION == "api" ? "API" : "Usage"} Documentation
|
||||
if SECTION == "api" || SECTION == "usage" || SECTION == "models"
|
||||
- var title_section = (SECTION == "api") ? "API" : SECTION.charAt(0).toUpperCase() + SECTION.slice(1)
|
||||
| #{title} | #{SITENAME} #{title_section} Documentation
|
||||
|
||||
else if SECTION != "index"
|
||||
| #{title} | #{SITENAME}
|
||||
|
@ -22,32 +27,30 @@ html(lang="en")
|
|||
meta(property="og:type" content="website")
|
||||
meta(property="og:site_name" content=sitename)
|
||||
meta(property="og:url" content="#{SITE_URL}/#{current.path.join('/')}")
|
||||
meta(property="og:title" content="#{title} - spaCy")
|
||||
meta(property="og:title" content=social_title)
|
||||
meta(property="og:description" content=description)
|
||||
meta(property="og:image" content=getSocialImg())
|
||||
meta(property="og:image" content=social_img)
|
||||
|
||||
meta(name="twitter:card" content="summary_large_image")
|
||||
meta(name="twitter:site" content="@" + SOCIAL.twitter)
|
||||
meta(name="twitter:title" content="#{title} - spaCy")
|
||||
meta(name="twitter:title" content=social_title)
|
||||
meta(name="twitter:description" content=description)
|
||||
meta(name="twitter:image" content=getSocialImg())
|
||||
meta(name="twitter:image" content=social_img)
|
||||
|
||||
link(rel="shortcut icon" href="/assets/img/favicon.ico")
|
||||
link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico")
|
||||
|
||||
if ALPHA && SECTION == "docs"
|
||||
if SECTION == "api"
|
||||
link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet")
|
||||
|
||||
else if SUBSECTION == "usage"
|
||||
link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet")
|
||||
|
||||
else
|
||||
link(href="/assets/css/style.css?v#{V_CSS}" rel="stylesheet")
|
||||
|
||||
body
|
||||
include _includes/_svg
|
||||
include _includes/_navigation
|
||||
|
||||
if SECTION == "docs"
|
||||
if !landing
|
||||
include _includes/_page-docs
|
||||
|
||||
else
|
||||
|
|
43
website/api/_annotation/_biluo.jade
Normal file
43
website/api/_annotation/_biluo.jade
Normal file
|
@ -0,0 +1,43 @@
|
|||
//- 💫 DOCS > API > ANNOTATION > BILUO
|
||||
|
||||
+table([ "Tag", "Description" ])
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme B] EGIN]
|
||||
+cell The first token of a multi-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme I] N]
|
||||
+cell An inner token of a multi-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme L] AST]
|
||||
+cell The final token of a multi-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme U] NIT]
|
||||
+cell A single-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme O] UT]
|
||||
+cell A non-entity token.
|
||||
|
||||
+aside("Why BILUO, not IOB?")
|
||||
| There are several coding schemes for encoding entity annotations as
|
||||
| token tags. These coding schemes are equally expressive, but not
|
||||
| necessarily equally learnable.
|
||||
| #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth]
|
||||
| showed that the minimal #[strong Begin], #[strong In], #[strong Out]
|
||||
| scheme was more difficult to learn than the #[strong BILUO] scheme that
|
||||
| we use, which explicitly marks boundary tokens.
|
||||
|
||||
p
|
||||
| spaCy translates the character offsets into this scheme, in order to
|
||||
| decide the cost of each action given the current state of the entity
|
||||
| recogniser. The costs are then used to calculate the gradient of the
|
||||
| loss, to train the model. The exact algorithm is a pastiche of
|
||||
| well-known methods, and is not currently described in any single
|
||||
| publication. The model is a greedy transition-based parser guided by a
|
||||
| linear model whose weights are learned using the averaged perceptron
|
||||
| loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle]
|
||||
| imitation learning strategy. The transition system is equivalent to the
|
||||
| BILUO tagging scheme.
|
115
website/api/_architecture/_cython.jade
Normal file
115
website/api/_architecture/_cython.jade
Normal file
|
@ -0,0 +1,115 @@
|
|||
//- 💫 DOCS > API > ARCHITECTURE > CYTHON
|
||||
|
||||
+aside("What's Cython?")
|
||||
| #[+a("http://cython.org/") Cython] is a language for writing
|
||||
| C extensions for Python. Most Python code is also valid Cython, but
|
||||
| you can add type declarations to get efficient memory-managed code
|
||||
| just like C or C++.
|
||||
|
||||
p
|
||||
| spaCy's core data structures are implemented as
|
||||
| #[+a("http://cython.org/") Cython] #[code cdef] classes. Memory is
|
||||
| managed through the #[+a(gh("cymem")) #[code cymem]]
|
||||
| #[code cymem.Pool] class, which allows you
|
||||
| to allocate memory which will be freed when the #[code Pool] object
|
||||
| is garbage collected. This means you usually don't have to worry
|
||||
| about freeing memory. You just have to decide which Python object
|
||||
| owns the memory, and make it own the #[code Pool]. When that object
|
||||
| goes out of scope, the memory will be freed. You do have to take
|
||||
| care that no pointers outlive the object that owns them — but this
|
||||
| is generally quite easy.
|
||||
|
||||
p
|
||||
| All Cython modules should have the #[code # cython: infer_types=True]
|
||||
| compiler directive at the top of the file. This makes the code much
|
||||
| cleaner, as it avoids the need for many type declarations. If
|
||||
| possible, you should prefer to declare your functions #[code nogil],
|
||||
| even if you don't especially care about multi-threading. The reason
|
||||
| is that #[code nogil] functions help the Cython compiler reason about
|
||||
| your code quite a lot — you're telling the compiler that no Python
|
||||
| dynamics are possible. This lets many errors be raised, and ensures
|
||||
| your function will run at C speed.
|
||||
|
||||
|
||||
p
|
||||
| Cython gives you many choices of sequences: you could have a Python
|
||||
| list, a numpy array, a memory view, a C++ vector, or a pointer.
|
||||
| Pointers are preferred, because they are fastest, have the most
|
||||
| explicit semantics, and let the compiler check your code more
|
||||
| strictly. C++ vectors are also great — but you should only use them
|
||||
| internally in functions. It's less friendly to accept a vector as an
|
||||
| argument, because that asks the user to do much more work. Here's
|
||||
| how to get a pointer from a numpy array, memory view or vector:
|
||||
|
||||
+code.
|
||||
cdef void get_pointers(np.ndarray[int, mode='c'] numpy_array, vector[int] cpp_vector, int[::1] memory_view) nogil:
|
||||
pointer1 = <int*>numpy_array.data
|
||||
pointer2 = cpp_vector.data()
|
||||
pointer3 = &memory_view[0]
|
||||
|
||||
p
|
||||
| Both C arrays and C++ vectors reassure the compiler that no Python
|
||||
| operations are possible on your variable. This is a big advantage:
|
||||
| it lets the Cython compiler raise many more errors for you.
|
||||
|
||||
p
|
||||
| When getting a pointer from a numpy array or memoryview, take care
|
||||
| that the data is actually stored in C-contiguous order — otherwise
|
||||
| you'll get a pointer to nonsense. The type-declarations in the code
|
||||
| above should generate runtime errors if buffers with incorrect
|
||||
| memory layouts are passed in. To iterate over the array, the
|
||||
| following style is preferred:
|
||||
|
||||
+code.
|
||||
cdef int c_total(const int* int_array, int length) nogil:
|
||||
total = 0
|
||||
for item in int_array[:length]:
|
||||
total += item
|
||||
return total
|
||||
|
||||
p
|
||||
| If this is confusing, consider that the compiler couldn't deal with
|
||||
| #[code for item in int_array:] — there's no length attached to a raw
|
||||
| pointer, so how could we figure out where to stop? The length is
|
||||
| provided in the slice notation as a solution to this. Note that we
|
||||
| don't have to declare the type of #[code item] in the code above —
|
||||
| the compiler can easily infer it. This gives us tidy code that looks
|
||||
| quite like Python, but is exactly as fast as C — because we've made
|
||||
| sure the compilation to C is trivial.
|
||||
|
||||
p
|
||||
| Your functions cannot be declared #[code nogil] if they need to
|
||||
| create Python objects or call Python functions. This is perfectly
|
||||
| okay — you shouldn't torture your code just to get #[code nogil]
|
||||
| functions. However, if your function isn't #[code nogil], you should
|
||||
| compile your module with #[code cython -a --cplus my_module.pyx] and
|
||||
| open the resulting #[code my_module.html] file in a browser. This
|
||||
| will let you see how Cython is compiling your code. Calls into the
|
||||
| Python run-time will be in bright yellow. This lets you easily see
|
||||
| whether Cython is able to correctly type your code, or whether there
|
||||
| are unexpected problems.
|
||||
|
||||
p
|
||||
| Working in Cython is very rewarding once you're over the initial
|
||||
| learning curve. As with C and C++, the first way you write something
|
||||
| in Cython will often be the performance-optimal approach. In
|
||||
| contrast, Python optimisation generally requires a lot of
|
||||
| experimentation. Is it faster to have an #[code if item in my_dict]
|
||||
| check, or to use #[code .get()]? What about
|
||||
| #[code try]/#[code except]? Does this numpy operation create a copy?
|
||||
| There's no way to guess the answers to these questions, and you'll
|
||||
| usually be dissatisfied with your results — so there's no way to
|
||||
| know when to stop this process. In the worst case, you'll make a
|
||||
| mess that invites the next reader to try their luck too. This is
|
||||
| like one of those
|
||||
| #[+a("http://www.wemjournal.org/article/S1080-6032%2809%2970088-2/abstract") volcanic gas-traps],
|
||||
| where the rescuers keep passing out from low oxygen, causing
|
||||
| another rescuer to follow — only to succumb themselves. In short,
|
||||
| just say no to optimizing your Python. If it's not fast enough the
|
||||
| first time, just switch to Cython.
|
||||
|
||||
+infobox("Resources")
|
||||
+list.o-no-block
|
||||
+item #[+a("http://docs.cython.org/en/latest/") Official Cython documentation] (cython.org)
|
||||
+item #[+a("https://explosion.ai/blog/writing-c-in-cython", true) Writing C in Cython] (explosion.ai)
|
||||
+item #[+a("https://explosion.ai/blog/multithreading-with-cython") Multi-threading spaCy’s parser and named entity recogniser] (explosion.ai)
|
141
website/api/_architecture/_nn-model.jade
Normal file
141
website/api/_architecture/_nn-model.jade
Normal file
|
@ -0,0 +1,141 @@
|
|||
//- 💫 DOCS > API > ARCHITECTURE > NN MODEL ARCHITECTURE
|
||||
|
||||
p
|
||||
| The parsing model is a blend of recent results. The two recent
|
||||
| inspirations have been the work of Eli Kiperwasser and Yoav Goldberg at
|
||||
| Bar Ilan#[+fn(1)], and the SyntaxNet team from Google. The foundation of
|
||||
| the parser is still based on the work of Joakim Nivre#[+fn(2)], who
|
||||
| introduced the transition-based framework#[+fn(3)], the arc-eager
|
||||
| transition system, and the imitation learning objective. The model is
|
||||
| implemented using #[+a(gh("thinc")) Thinc], spaCy's machine learning
|
||||
| library. We first predict context-sensitive vectors for each word in the
|
||||
| input:
|
||||
|
||||
+code.
|
||||
(embed_lower | embed_prefix | embed_suffix | embed_shape)
|
||||
>> Maxout(token_width)
|
||||
>> convolution ** 4
|
||||
|
||||
p
|
||||
| This convolutional layer is shared between the tagger, parser and NER,
|
||||
| and will also be shared by the future neural lemmatizer. Because the
|
||||
| parser shares these layers with the tagger, the parser does not require
|
||||
| tag features. I got this trick from David Weiss's "Stack-propagation"
|
||||
| paper#[+fn(4)].
|
||||
|
||||
p
|
||||
| To boost the representation, the tagger actually predicts a "super tag"
|
||||
| with POS, morphology and dependency label#[+fn(5)]. The tagger predicts
|
||||
| these supertags by adding a softmax layer onto the convolutional layer –
|
||||
| so, we're teaching the convolutional layer to give us a representation
|
||||
| that's one affine transform from this informative lexical information.
|
||||
| This is obviously good for the parser (which backprops to the
|
||||
| convolutions too). The parser model makes a state vector by concatenating
|
||||
| the vector representations for its context tokens. The current context
|
||||
| tokens:
|
||||
|
||||
+table
|
||||
+row
|
||||
+cell #[code S0], #[code S1], #[code S2]
|
||||
+cell Top three words on the stack.
|
||||
|
||||
+row
|
||||
+cell #[code B0], #[code B1]
|
||||
+cell First two words of the buffer.
|
||||
|
||||
+row
|
||||
+cell.u-nowrap
|
||||
| #[code S0L1], #[code S1L1], #[code S2L1], #[code B0L1],
|
||||
| #[code B1L1]#[br]
|
||||
| #[code S0L2], #[code S1L2], #[code S2L2], #[code B0L2],
|
||||
| #[code B1L2]
|
||||
+cell
|
||||
| Leftmost and second leftmost children of #[code S0], #[code S1],
|
||||
| #[code S2], #[code B0] and #[code B1].
|
||||
|
||||
+row
|
||||
+cell.u-nowrap
|
||||
| #[code S0R1], #[code S1R1], #[code S2R1], #[code B0R1],
|
||||
| #[code B1R1]#[br]
|
||||
| #[code S0R2], #[code S1R2], #[code S2R2], #[code B0R2],
|
||||
| #[code B1R2]
|
||||
+cell
|
||||
| Rightmost and second rightmost children of #[code S0], #[code S1],
|
||||
| #[code S2], #[code B0] and #[code B1].
|
||||
|
||||
p
|
||||
| This makes the state vector quite long: #[code 13*T], where #[code T] is
|
||||
| the token vector width (128 is working well). Fortunately, there's a way
|
||||
| to structure the computation to save some expense (and make it more
|
||||
| GPU-friendly).
|
||||
|
||||
p
|
||||
| The parser typically visits #[code 2*N] states for a sentence of length
|
||||
| #[code N] (although it may visit more, if it back-tracks with a
|
||||
| non-monotonic transition#[+fn(6)]). A naive implementation would require
|
||||
| #[code 2*N (B, 13*T) @ (13*T, H)] matrix multiplications for a batch of
|
||||
| size #[code B]. We can instead perform one #[code (B*N, T) @ (T, 13*H)]
|
||||
| multiplication, to pre-compute the hidden weights for each positional
|
||||
| feature with respect to the words in the batch. (Note that our token
|
||||
| vectors come from the CNN — so we can't play this trick over the
|
||||
| vocabulary. That's how Stanford's NN parser#[+fn(7)] works — and why its
|
||||
| model is so big.)
|
||||
|
||||
p
|
||||
| This pre-computation strategy allows a nice compromise between
|
||||
| GPU-friendliness and implementation simplicity. The CNN and the wide
|
||||
| lower layer are computed on the GPU, and then the precomputed hidden
|
||||
| weights are moved to the CPU, before we start the transition-based
|
||||
| parsing process. This makes a lot of things much easier. We don't have to
|
||||
| worry about variable-length batch sizes, and we don't have to implement
|
||||
| the dynamic oracle in CUDA to train.
|
||||
|
||||
p
|
||||
| Currently the parser's loss function is multilabel log loss#[+fn(8)], as
|
||||
| the dynamic oracle allows multiple states to be 0 cost. This is defined
|
||||
| as follows, where #[code gZ] is the sum of the scores assigned to gold
|
||||
| classes:
|
||||
|
||||
+code.
|
||||
(exp(score) / Z) - (exp(score) / gZ)
|
||||
|
||||
+bibliography
|
||||
+item
|
||||
| #[+a("https://www.semanticscholar.org/paper/Simple-and-Accurate-Dependency-Parsing-Using-Bidir-Kiperwasser-Goldberg/3cf31ecb2724b5088783d7c96a5fc0d5604cbf41") Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations]
|
||||
br
|
||||
| Eliyahu Kiperwasser, Yoav Goldberg. (2016)
|
||||
|
||||
+item
|
||||
| #[+a("https://www.semanticscholar.org/paper/A-Dynamic-Oracle-for-Arc-Eager-Dependency-Parsing-Goldberg-Nivre/22697256ec19ecc3e14fcfc63624a44cf9c22df4") A Dynamic Oracle for Arc-Eager Dependency Parsing]
|
||||
br
|
||||
| Yoav Goldberg, Joakim Nivre (2012)
|
||||
|
||||
+item
|
||||
| #[+a("https://explosion.ai/blog/parsing-english-in-python") Parsing English in 500 Lines of Python]
|
||||
br
|
||||
| Matthew Honnibal (2013)
|
||||
|
||||
+item
|
||||
| #[+a("https://www.semanticscholar.org/paper/Stack-propagation-Improved-Representation-Learning-Zhang-Weiss/0c133f79b23e8c680891d2e49a66f0e3d37f1466") Stack-propagation: Improved Representation Learning for Syntax]
|
||||
br
|
||||
| Yuan Zhang, David Weiss (2016)
|
||||
|
||||
+item
|
||||
| #[+a("https://www.semanticscholar.org/paper/Deep-multi-task-learning-with-low-level-tasks-supe-S%C3%B8gaard-Goldberg/03ad06583c9721855ccd82c3d969a01360218d86") Deep multi-task learning with low level tasks supervised at lower layers]
|
||||
br
|
||||
| Anders Søgaard, Yoav Goldberg (2016)
|
||||
|
||||
+item
|
||||
| #[+a("https://www.semanticscholar.org/paper/An-Improved-Non-monotonic-Transition-System-for-De-Honnibal-Johnson/4094cee47ade13b77b5ab4d2e6cb9dd2b8a2917c") An Improved Non-monotonic Transition System for Dependency Parsing]
|
||||
br
|
||||
| Matthew Honnibal, Mark Johnson (2015)
|
||||
|
||||
+item
|
||||
| #[+a("http://cs.stanford.edu/people/danqi/papers/emnlp2014.pdf") A Fast and Accurate Dependency Parser using Neural Networks]
|
||||
br
|
||||
| Danqi Chen, Christopher D. Manning (2014)
|
||||
|
||||
+item
|
||||
| #[+a("https://www.semanticscholar.org/paper/Parsing-the-Wall-Street-Journal-using-a-Lexical-Fu-Riezler-King/0ad07862a91cd59b7eb5de38267e47725a62b8b2") Parsing the Wall Street Journal using a Lexical-Functional Grammar and Discriminative Estimation Techniques]
|
||||
br
|
||||
| Stefan Riezler et al. (2002)
|
|
@ -1,29 +1,32 @@
|
|||
{
|
||||
"sidebar": {
|
||||
"Introduction": {
|
||||
"Facts & Figures": "./",
|
||||
"Languages": "language-models",
|
||||
"Annotation Specs": "annotation"
|
||||
"Overview": {
|
||||
"Architecture": "./",
|
||||
"Annotation Specs": "annotation",
|
||||
"Functions": "top-level"
|
||||
},
|
||||
"Top-level": {
|
||||
"spacy": "spacy",
|
||||
"displacy": "displacy",
|
||||
"Utility Functions": "util",
|
||||
"Command line": "cli"
|
||||
},
|
||||
"Classes": {
|
||||
"Containers": {
|
||||
"Doc": "doc",
|
||||
"Token": "token",
|
||||
"Span": "span",
|
||||
"Lexeme": "lexeme"
|
||||
},
|
||||
|
||||
"Pipeline": {
|
||||
"Language": "language",
|
||||
"Tokenizer": "tokenizer",
|
||||
"Pipe": "pipe",
|
||||
"Tensorizer": "tensorizer",
|
||||
"Tagger": "tagger",
|
||||
"DependencyParser": "dependencyparser",
|
||||
"EntityRecognizer": "entityrecognizer",
|
||||
"TextCategorizer": "textcategorizer",
|
||||
"Tokenizer": "tokenizer",
|
||||
"Lemmatizer": "lemmatizer",
|
||||
"Matcher": "matcher",
|
||||
"Lexeme": "lexeme",
|
||||
"PhraseMatcher": "phrasematcher"
|
||||
},
|
||||
|
||||
"Other": {
|
||||
"Vocab": "vocab",
|
||||
"StringStore": "stringstore",
|
||||
"Vectors": "vectors",
|
||||
|
@ -34,52 +37,37 @@
|
|||
},
|
||||
|
||||
"index": {
|
||||
"title": "Facts & Figures",
|
||||
"next": "language-models"
|
||||
"title": "Architecture",
|
||||
"next": "annotation",
|
||||
"menu": {
|
||||
"Basics": "basics",
|
||||
"Neural Network Model": "nn-model",
|
||||
"Cython Conventions": "cython"
|
||||
}
|
||||
},
|
||||
|
||||
"language-models": {
|
||||
"title": "Languages",
|
||||
"next": "philosophy"
|
||||
},
|
||||
|
||||
"philosophy": {
|
||||
"title": "Philosophy"
|
||||
},
|
||||
|
||||
"spacy": {
|
||||
"title": "spaCy top-level functions",
|
||||
"source": "spacy/__init__.py",
|
||||
"next": "displacy"
|
||||
},
|
||||
|
||||
"displacy": {
|
||||
"title": "displaCy",
|
||||
"tag": "module",
|
||||
"source": "spacy/displacy",
|
||||
"next": "util"
|
||||
},
|
||||
|
||||
"util": {
|
||||
"title": "Utility Functions",
|
||||
"source": "spacy/util.py",
|
||||
"next": "cli"
|
||||
},
|
||||
|
||||
"cli": {
|
||||
"title": "Command Line Interface",
|
||||
"source": "spacy/cli"
|
||||
"top-level": {
|
||||
"title": "Top-level Functions",
|
||||
"menu": {
|
||||
"spacy": "spacy",
|
||||
"displacy": "displacy",
|
||||
"Utility Functions": "util",
|
||||
"Compatibility": "compat",
|
||||
"Command Line": "cli"
|
||||
}
|
||||
},
|
||||
|
||||
"language": {
|
||||
"title": "Language",
|
||||
"tag": "class",
|
||||
"teaser": "A text-processing pipeline.",
|
||||
"source": "spacy/language.py"
|
||||
},
|
||||
|
||||
"doc": {
|
||||
"title": "Doc",
|
||||
"tag": "class",
|
||||
"teaser": "A container for accessing linguistic annotations.",
|
||||
"source": "spacy/tokens/doc.pyx"
|
||||
},
|
||||
|
||||
|
@ -103,6 +91,7 @@
|
|||
|
||||
"vocab": {
|
||||
"title": "Vocab",
|
||||
"teaser": "A storage class for vocabulary and other data shared across a language.",
|
||||
"tag": "class",
|
||||
"source": "spacy/vocab.pyx"
|
||||
},
|
||||
|
@ -115,10 +104,27 @@
|
|||
|
||||
"matcher": {
|
||||
"title": "Matcher",
|
||||
"teaser": "Match sequences of tokens, based on pattern rules.",
|
||||
"tag": "class",
|
||||
"source": "spacy/matcher.pyx"
|
||||
},
|
||||
|
||||
"phrasematcher": {
|
||||
"title": "PhraseMatcher",
|
||||
"teaser": "Match sequences of tokens, based on documents.",
|
||||
"tag": "class",
|
||||
"tag_new": 2,
|
||||
"source": "spacy/matcher.pyx"
|
||||
},
|
||||
|
||||
"pipe": {
|
||||
"title": "Pipe",
|
||||
"teaser": "Abstract base class defining the API for pipeline components.",
|
||||
"tag": "class",
|
||||
"tag_new": 2,
|
||||
"source": "spacy/pipeline.pyx"
|
||||
},
|
||||
|
||||
"dependenyparser": {
|
||||
"title": "DependencyParser",
|
||||
"tag": "class",
|
||||
|
@ -127,18 +133,22 @@
|
|||
|
||||
"entityrecognizer": {
|
||||
"title": "EntityRecognizer",
|
||||
"teaser": "Annotate named entities on documents.",
|
||||
"tag": "class",
|
||||
"source": "spacy/pipeline.pyx"
|
||||
},
|
||||
|
||||
"textcategorizer": {
|
||||
"title": "TextCategorizer",
|
||||
"teaser": "Add text categorization models to spaCy pipelines.",
|
||||
"tag": "class",
|
||||
"tag_new": 2,
|
||||
"source": "spacy/pipeline.pyx"
|
||||
},
|
||||
|
||||
"dependencyparser": {
|
||||
"title": "DependencyParser",
|
||||
"teaser": "Annotate syntactic dependencies on documents.",
|
||||
"tag": "class",
|
||||
"source": "spacy/pipeline.pyx"
|
||||
},
|
||||
|
@ -149,15 +159,23 @@
|
|||
"source": "spacy/tokenizer.pyx"
|
||||
},
|
||||
|
||||
"lemmatizer": {
|
||||
"title": "Lemmatizer",
|
||||
"tag": "class"
|
||||
},
|
||||
|
||||
"tagger": {
|
||||
"title": "Tagger",
|
||||
"teaser": "Annotate part-of-speech tags on documents.",
|
||||
"tag": "class",
|
||||
"source": "spacy/pipeline.pyx"
|
||||
},
|
||||
|
||||
"tensorizer": {
|
||||
"title": "Tensorizer",
|
||||
"teaser": "Add a tensor with position-sensitive meaning representations to a document.",
|
||||
"tag": "class",
|
||||
"tag_new": 2,
|
||||
"source": "spacy/pipeline.pyx"
|
||||
},
|
||||
|
||||
|
@ -169,23 +187,38 @@
|
|||
|
||||
"goldcorpus": {
|
||||
"title": "GoldCorpus",
|
||||
"teaser": "An annotated corpus, using the JSON file format.",
|
||||
"tag": "class",
|
||||
"tag_new": 2,
|
||||
"source": "spacy/gold.pyx"
|
||||
},
|
||||
|
||||
"binder": {
|
||||
"title": "Binder",
|
||||
"tag": "class",
|
||||
"tag_new": 2,
|
||||
"source": "spacy/tokens/binder.pyx"
|
||||
},
|
||||
|
||||
"vectors": {
|
||||
"title": "Vectors",
|
||||
"teaser": "Store, save and load word vectors.",
|
||||
"tag": "class",
|
||||
"tag_new": 2,
|
||||
"source": "spacy/vectors.pyx"
|
||||
},
|
||||
|
||||
"annotation": {
|
||||
"title": "Annotation Specifications"
|
||||
"title": "Annotation Specifications",
|
||||
"teaser": "Schemes used for labels, tags and training data.",
|
||||
"menu": {
|
||||
"Tokenization": "tokenization",
|
||||
"Sentence Boundaries": "sbd",
|
||||
"POS Tagging": "pos-tagging",
|
||||
"Lemmatization": "lemmatization",
|
||||
"Dependencies": "dependency-parsing",
|
||||
"Named Entities": "named-entities",
|
||||
"Training Data": "training"
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,26 +1,17 @@
|
|||
//- 💫 DOCS > USAGE > COMMAND LINE INTERFACE
|
||||
|
||||
include ../../_includes/_mixins
|
||||
//- 💫 DOCS > API > TOP-LEVEL > COMMAND LINE INTERFACE
|
||||
|
||||
p
|
||||
| As of v1.7.0, spaCy comes with new command line helpers to download and
|
||||
| link models and show useful debugging information. For a list of available
|
||||
| commands, type #[code spacy --help].
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
| As of spaCy 2.0, the #[code model] command to initialise a model data
|
||||
| directory is deprecated. The command was only necessary because previous
|
||||
| versions of spaCy expected a model directory to already be set up. This
|
||||
| has since been changed, so you can use the #[+api("cli#train") #[code train]]
|
||||
| command straight away.
|
||||
|
||||
+h(2, "download") Download
|
||||
+h(3, "download") Download
|
||||
|
||||
p
|
||||
| Download #[+a("/docs/usage/models") models] for spaCy. The downloader finds the
|
||||
| Download #[+a("/usage/models") models] for spaCy. The downloader finds the
|
||||
| best-matching compatible version, uses pip to download the model as a
|
||||
| package and automatically creates a
|
||||
| #[+a("/docs/usage/models#usage") shortcut link] to load the model by name.
|
||||
| #[+a("/usage/models#usage") shortcut link] to load the model by name.
|
||||
| Direct downloads don't perform any compatibility checks and require the
|
||||
| model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]).
|
||||
|
||||
|
@ -49,15 +40,15 @@ p
|
|||
| detailed messages in case things go wrong. It's #[strong not recommended]
|
||||
| to use this command as part of an automated process. If you know which
|
||||
| model your project needs, you should consider a
|
||||
| #[+a("/docs/usage/models#download-pip") direct download via pip], or
|
||||
| #[+a("/usage/models#download-pip") direct download via pip], or
|
||||
| uploading the model to a local PyPI installation and fetching it straight
|
||||
| from there. This will also allow you to add it as a versioned package
|
||||
| dependency to your project.
|
||||
|
||||
+h(2, "link") Link
|
||||
+h(3, "link") Link
|
||||
|
||||
p
|
||||
| Create a #[+a("/docs/usage/models#usage") shortcut link] for a model,
|
||||
| Create a #[+a("/usage/models#usage") shortcut link] for a model,
|
||||
| either a Python package or a local directory. This will let you load
|
||||
| models from any location using a custom name via
|
||||
| #[+api("spacy#load") #[code spacy.load()]].
|
||||
|
@ -95,7 +86,7 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+h(2, "info") Info
|
||||
+h(3, "info") Info
|
||||
|
||||
p
|
||||
| Print information about your spaCy installation, models and local setup,
|
||||
|
@ -122,15 +113,15 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+h(2, "convert") Convert
|
||||
+h(3, "convert") Convert
|
||||
|
||||
p
|
||||
| Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format]
|
||||
| Convert files into spaCy's #[+a("/api/annotation#json-input") JSON format]
|
||||
| for use with the #[code train] command and other experiment management
|
||||
| functions. The right converter is chosen based on the file extension of
|
||||
| the input file. Currently only supports #[code .conllu].
|
||||
|
||||
+code(false, "bash", "$").
|
||||
+code(false, "bash", "$", false, false, true).
|
||||
spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
|
@ -159,14 +150,18 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+h(2, "train") Train
|
||||
+h(3, "train") Train
|
||||
|
||||
p
|
||||
| Train a model. Expects data in spaCy's
|
||||
| #[+a("/docs/api/annotation#json-input") JSON format].
|
||||
| #[+a("/api/annotation#json-input") JSON format]. On each epoch, a model
|
||||
| will be saved out to the directory. Accuracy scores and model details
|
||||
| will be added to a #[+a("/usage/training#models-generating") #[code meta.json]]
|
||||
| to allow packaging the model using the
|
||||
| #[+api("cli#package") #[code package]] command.
|
||||
|
||||
+code(false, "bash", "$").
|
||||
spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
|
||||
+code(false, "bash", "$", false, false, true).
|
||||
spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser] [--no-entities] [--gold-preproc]
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
|
@ -204,6 +199,27 @@ p
|
|||
+cell option
|
||||
+cell Use GPU.
|
||||
|
||||
+row
|
||||
+cell #[code --vectors], #[code -v]
|
||||
+cell option
|
||||
+cell Model to load vectors from.
|
||||
|
||||
+row
|
||||
+cell #[code --meta-path], #[code -m]
|
||||
+cell option
|
||||
+cell
|
||||
| #[+tag-new(2)] Optional path to model
|
||||
| #[+a("/usage/training#models-generating") #[code meta.json]].
|
||||
| All relevant properties like #[code lang], #[code pipeline] and
|
||||
| #[code spacy_version] will be overwritten.
|
||||
|
||||
+row
|
||||
+cell #[code --version], #[code -V]
|
||||
+cell option
|
||||
+cell
|
||||
| Model version. Will be written out to the model's
|
||||
| #[code meta.json] after training.
|
||||
|
||||
+row
|
||||
+cell #[code --no-tagger], #[code -T]
|
||||
+cell flag
|
||||
|
@ -219,12 +235,18 @@ p
|
|||
+cell flag
|
||||
+cell Don't train NER.
|
||||
|
||||
+row
|
||||
+cell #[code --gold-preproc], #[code -G]
|
||||
+cell flag
|
||||
+cell Use gold preprocessing.
|
||||
|
||||
+row
|
||||
+cell #[code --help], #[code -h]
|
||||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+h(3, "train-hyperparams") Environment variables for hyperparameters
|
||||
+h(4, "train-hyperparams") Environment variables for hyperparameters
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| spaCy lets you set hyperparameters for training via environment variables.
|
||||
|
@ -236,98 +258,149 @@ p
|
|||
+code(false, "bash").
|
||||
parser_hidden_depth=2 parser_maxout_pieces=1 train-parser
|
||||
|
||||
+under-construction
|
||||
|
||||
+table(["Name", "Description", "Default"])
|
||||
+row
|
||||
+cell #[code dropout_from]
|
||||
+cell
|
||||
+cell Initial dropout rate.
|
||||
+cell #[code 0.2]
|
||||
|
||||
+row
|
||||
+cell #[code dropout_to]
|
||||
+cell
|
||||
+cell Final dropout rate.
|
||||
+cell #[code 0.2]
|
||||
|
||||
+row
|
||||
+cell #[code dropout_decay]
|
||||
+cell
|
||||
+cell Rate of dropout change.
|
||||
+cell #[code 0.0]
|
||||
|
||||
+row
|
||||
+cell #[code batch_from]
|
||||
+cell
|
||||
+cell Initial batch size.
|
||||
+cell #[code 1]
|
||||
|
||||
+row
|
||||
+cell #[code batch_to]
|
||||
+cell
|
||||
+cell Final batch size.
|
||||
+cell #[code 64]
|
||||
|
||||
+row
|
||||
+cell #[code batch_compound]
|
||||
+cell
|
||||
+cell Rate of batch size acceleration.
|
||||
+cell #[code 1.001]
|
||||
|
||||
+row
|
||||
+cell #[code token_vector_width]
|
||||
+cell
|
||||
+cell Width of embedding tables and convolutional layers.
|
||||
+cell #[code 128]
|
||||
|
||||
+row
|
||||
+cell #[code embed_size]
|
||||
+cell
|
||||
+cell Number of rows in embedding tables.
|
||||
+cell #[code 7500]
|
||||
|
||||
+row
|
||||
+cell #[code parser_maxout_pieces]
|
||||
+cell
|
||||
+cell Number of pieces in the parser's and NER's first maxout layer.
|
||||
+cell #[code 2]
|
||||
|
||||
+row
|
||||
+cell #[code parser_hidden_depth]
|
||||
+cell
|
||||
+cell Number of hidden layers in the parser and NER.
|
||||
+cell #[code 1]
|
||||
|
||||
+row
|
||||
+cell #[code hidden_width]
|
||||
+cell
|
||||
+cell Size of the parser's and NER's hidden layers.
|
||||
+cell #[code 128]
|
||||
|
||||
+row
|
||||
+cell #[code learn_rate]
|
||||
+cell
|
||||
+cell Learning rate.
|
||||
+cell #[code 0.001]
|
||||
|
||||
+row
|
||||
+cell #[code optimizer_B1]
|
||||
+cell
|
||||
+cell Momentum for the Adam solver.
|
||||
+cell #[code 0.9]
|
||||
|
||||
+row
|
||||
+cell #[code optimizer_B2]
|
||||
+cell
|
||||
+cell Adagrad-momentum for the Adam solver.
|
||||
+cell #[code 0.999]
|
||||
|
||||
+row
|
||||
+cell #[code optimizer_eps]
|
||||
+cell
|
||||
+cell Epsilon value for the Adam solver.
|
||||
+cell #[code 1e-08]
|
||||
|
||||
+row
|
||||
+cell #[code L2_penalty]
|
||||
+cell
|
||||
+cell L2 regularisation penalty.
|
||||
+cell #[code 1e-06]
|
||||
|
||||
+row
|
||||
+cell #[code grad_norm_clip]
|
||||
+cell
|
||||
+cell Gradient L2 norm constraint.
|
||||
+cell #[code 1.0]
|
||||
|
||||
+h(2, "package") Package
|
||||
+h(3, "evaluate") Evaluate
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Generate a #[+a("/docs/usage/saving-loading#generating") model Python package]
|
||||
| Evaluate a model's accuracy and speed on JSON-formatted annotated data.
|
||||
| Will print the results and optionally export
|
||||
| #[+a("/usage/visualizers") displaCy visualizations] of a sample set of
|
||||
| parses to #[code .html] files. Visualizations for the dependency parse
|
||||
| and NER will be exported as separate files if the respective component
|
||||
| is present in the model's pipeline.
|
||||
|
||||
+code(false, "bash", "$", false, false, true).
|
||||
spacy evaluate [model] [data_path] [--displacy-path] [--displacy-limit] [--gpu-id] [--gold-preproc]
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code model]
|
||||
+cell positional
|
||||
+cell
|
||||
| Model to evaluate. Can be a package or shortcut link name, or a
|
||||
| path to a model data directory.
|
||||
|
||||
+row
|
||||
+cell #[code data_path]
|
||||
+cell positional
|
||||
+cell Location of JSON-formatted evaluation data.
|
||||
|
||||
+row
|
||||
+cell #[code --displacy-path], #[code -dp]
|
||||
+cell option
|
||||
+cell
|
||||
| Directory to output rendered parses as HTML. If not set, no
|
||||
| visualizations will be generated.
|
||||
|
||||
+row
|
||||
+cell #[code --displacy-limit], #[code -dl]
|
||||
+cell option
|
||||
+cell
|
||||
| Number of parses to generate per file. Defaults to #[code 25].
|
||||
| Keep in mind that a significantly higher number might cause the
|
||||
| #[code .html] files to render slowly.
|
||||
|
||||
+row
|
||||
+cell #[code --gpu-id], #[code -g]
|
||||
+cell option
|
||||
+cell GPU to use, if any. Defaults to #[code -1] for CPU.
|
||||
|
||||
+row
|
||||
+cell #[code --gold-preproc], #[code -G]
|
||||
+cell flag
|
||||
+cell Use gold preprocessing.
|
||||
|
||||
|
||||
+h(3, "package") Package
|
||||
|
||||
p
|
||||
| Generate a #[+a("/usage/training#models-generating") model Python package]
|
||||
| from an existing model data directory. All data files are copied over.
|
||||
| If the path to a meta.json is supplied, or a meta.json is found in the
|
||||
| input directory, this file is used. Otherwise, the data can be entered
|
||||
|
@ -336,8 +409,8 @@ p
|
|||
| sure you're always using the latest versions. This means you need to be
|
||||
| connected to the internet to use this command.
|
||||
|
||||
+code(false, "bash", "$").
|
||||
spacy package [input_dir] [output_dir] [--meta] [--force]
|
||||
+code(false, "bash", "$", false, false, true).
|
||||
spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
|
@ -353,14 +426,14 @@ p
|
|||
+row
|
||||
+cell #[code --meta-path], #[code -m]
|
||||
+cell option
|
||||
+cell Path to meta.json file (optional).
|
||||
+cell #[+tag-new(2)] Path to meta.json file (optional).
|
||||
|
||||
+row
|
||||
+cell #[code --create-meta], #[code -c]
|
||||
+cell flag
|
||||
+cell
|
||||
| Create a meta.json file on the command line, even if one already
|
||||
| exists in the directory.
|
||||
| #[+tag-new(2)] Create a meta.json file on the command line, even
|
||||
| if one already exists in the directory.
|
||||
|
||||
+row
|
||||
+cell #[code --force], #[code -f]
|
91
website/api/_top-level/_compat.jade
Normal file
91
website/api/_top-level/_compat.jade
Normal file
|
@ -0,0 +1,91 @@
|
|||
//- 💫 DOCS > API > TOP-LEVEL > COMPATIBILITY
|
||||
|
||||
p
|
||||
| All Python code is written in an
|
||||
| #[strong intersection of Python 2 and Python 3]. This is easy in Cython,
|
||||
| but somewhat ugly in Python. Logic that deals with Python or platform
|
||||
| compatibility only lives in #[code spacy.compat]. To distinguish them from
|
||||
| the builtin functions, replacement functions are suffixed with an
|
||||
| underscore, e.g. #[code unicode_]. For specific checks, spaCy uses the
|
||||
| #[code six] and #[code ftfy] packages.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.compat import unicode_, json_dumps
|
||||
|
||||
compatible_unicode = unicode_('hello world')
|
||||
compatible_json = json_dumps({'key': 'value'})
|
||||
|
||||
+table(["Name", "Python 2", "Python 3"])
|
||||
+row
|
||||
+cell #[code compat.bytes_]
|
||||
+cell #[code str]
|
||||
+cell #[code bytes]
|
||||
|
||||
+row
|
||||
+cell #[code compat.unicode_]
|
||||
+cell #[code unicode]
|
||||
+cell #[code str]
|
||||
|
||||
+row
|
||||
+cell #[code compat.basestring_]
|
||||
+cell #[code basestring]
|
||||
+cell #[code str]
|
||||
|
||||
+row
|
||||
+cell #[code compat.input_]
|
||||
+cell #[code raw_input]
|
||||
+cell #[code input]
|
||||
|
||||
+row
|
||||
+cell #[code compat.json_dumps]
|
||||
+cell #[code ujson.dumps] with #[code .decode('utf8')]
|
||||
+cell #[code ujson.dumps]
|
||||
|
||||
+row
|
||||
+cell #[code compat.path2str]
|
||||
+cell #[code str(path)] with #[code .decode('utf8')]
|
||||
+cell #[code str(path)]
|
||||
|
||||
+h(3, "is_config") compat.is_config
|
||||
+tag function
|
||||
|
||||
p
|
||||
| Check if a specific configuration of Python version and operating system
|
||||
| matches the user's setup. Mostly used to display targeted error messages.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.compat import is_config
|
||||
|
||||
if is_config(python2=True, windows=True):
|
||||
print("You are using Python 2 on Windows.")
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code python2]
|
||||
+cell bool
|
||||
+cell spaCy is executed with Python 2.x.
|
||||
|
||||
+row
|
||||
+cell #[code python3]
|
||||
+cell bool
|
||||
+cell spaCy is executed with Python 3.x.
|
||||
|
||||
+row
|
||||
+cell #[code windows]
|
||||
+cell bool
|
||||
+cell spaCy is executed on Windows.
|
||||
|
||||
+row
|
||||
+cell #[code linux]
|
||||
+cell bool
|
||||
+cell spaCy is executed on Linux.
|
||||
|
||||
+row
|
||||
+cell #[code osx]
|
||||
+cell bool
|
||||
+cell spaCy is executed on OS X or macOS.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the specified configuration matches the user's platform.
|
|
@ -1,14 +1,12 @@
|
|||
//- 💫 DOCS > API > DISPLACY
|
||||
|
||||
include ../../_includes/_mixins
|
||||
//- 💫 DOCS > API > TOP-LEVEL > DISPLACY
|
||||
|
||||
p
|
||||
| As of v2.0, spaCy comes with a built-in visualization suite. For more
|
||||
| info and examples, see the usage guide on
|
||||
| #[+a("/docs/usage/visualizers") visualizing spaCy].
|
||||
| #[+a("/usage/visualizers") visualizing spaCy].
|
||||
|
||||
|
||||
+h(2, "serve") displacy.serve
|
||||
+h(3, "displacy.serve") displacy.serve
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
|
@ -60,7 +58,7 @@ p
|
|||
+cell bool
|
||||
+cell
|
||||
| Don't parse #[code Doc] and instead, expect a dict or list of
|
||||
| dicts. #[+a("/docs/usage/visualizers#manual-usage") See here]
|
||||
| dicts. #[+a("/usage/visualizers#manual-usage") See here]
|
||||
| for formats and examples.
|
||||
+cell #[code False]
|
||||
|
||||
|
@ -70,7 +68,7 @@ p
|
|||
+cell Port to serve visualization.
|
||||
+cell #[code 5000]
|
||||
|
||||
+h(2, "render") displacy.render
|
||||
+h(3, "displacy.render") displacy.render
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
|
@ -127,24 +125,24 @@ p Render a dependency parse tree or named entity visualization.
|
|||
+cell bool
|
||||
+cell
|
||||
| Don't parse #[code Doc] and instead, expect a dict or list of
|
||||
| dicts. #[+a("/docs/usage/visualizers#manual-usage") See here]
|
||||
| dicts. #[+a("/usage/visualizers#manual-usage") See here]
|
||||
| for formats and examples.
|
||||
+cell #[code False]
|
||||
|
||||
+footrow
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell unicode
|
||||
+cell Rendered HTML markup.
|
||||
+cell
|
||||
|
||||
+h(2, "options") Visualizer options
|
||||
+h(3, "displacy_options") Visualizer options
|
||||
|
||||
p
|
||||
| The #[code options] argument lets you specify additional settings for
|
||||
| each visualizer. If a setting is not present in the options, the default
|
||||
| value will be used.
|
||||
|
||||
+h(3, "options-dep") Dependency Visualizer options
|
||||
+h(4, "options-dep") Dependency Visualizer options
|
||||
|
||||
+aside-code("Example").
|
||||
options = {'compact': True, 'color': 'blue'}
|
||||
|
@ -219,7 +217,7 @@ p
|
|||
+cell Distance between words in px.
|
||||
+cell #[code 175] / #[code 85] (compact)
|
||||
|
||||
+h(3, "options-ent") Named Entity Visualizer options
|
||||
+h(4, "displacy_options-ent") Named Entity Visualizer options
|
||||
|
||||
+aside-code("Example").
|
||||
options = {'ents': ['PERSON', 'ORG', 'PRODUCT'],
|
||||
|
@ -244,6 +242,6 @@ p
|
|||
|
||||
p
|
||||
| By default, displaCy comes with colours for all
|
||||
| #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy].
|
||||
| #[+a("/api/annotation#named-entities") entity types supported by spaCy].
|
||||
| If you're using custom entity types, you can use the #[code colors]
|
||||
| setting to add your own colours for them.
|
|
@ -1,15 +1,13 @@
|
|||
//- 💫 DOCS > API > SPACY
|
||||
//- 💫 DOCS > API > TOP-LEVEL > SPACY
|
||||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
+h(2, "load") spacy.load
|
||||
+h(3, "spacy.load") spacy.load
|
||||
+tag function
|
||||
+tag-model
|
||||
|
||||
p
|
||||
| Load a model via its #[+a("/docs/usage/models#usage") shortcut link],
|
||||
| Load a model via its #[+a("/usage/models#usage") shortcut link],
|
||||
| the name of an installed
|
||||
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
|
||||
| #[+a("/usage/training#models-generating") model package], a unicode
|
||||
| path or a #[code Path]-like object. spaCy will try resolving the load
|
||||
| argument in this order. If a model is loaded from a shortcut link or
|
||||
| package name, spaCy will assume it's a Python package and import it and
|
||||
|
@ -38,25 +36,57 @@ p
|
|||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
|
||||
| #[+a("/usage/processing-pipelines#disabling") disable].
|
||||
|
||||
+footrow
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell A #[code Language] object with the loaded model.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
+infobox("Deprecation note", "⚠️")
|
||||
.o-block
|
||||
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
||||
| will also raise an error if no model could be loaded and never just
|
||||
| return an empty #[code Language] object. If you need a blank language,
|
||||
| you need to import it explicitly (#[code from spacy.lang.en import English])
|
||||
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
|
||||
| you can use the new function #[+api("spacy#blank") #[code spacy.blank()]]
|
||||
| or import the class explicitly, e.g.
|
||||
| #[code from spacy.lang.en import English].
|
||||
|
||||
+code-new nlp = spacy.load('/model')
|
||||
+code-old nlp = spacy.load('en', path='/model')
|
||||
|
||||
+h(2, "info") spacy.info
|
||||
+h(3, "spacy.blank") spacy.blank
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Create a blank model of a given language class. This function is the
|
||||
| twin of #[code spacy.load()].
|
||||
|
||||
+aside-code("Example").
|
||||
nlp_en = spacy.blank('en')
|
||||
nlp_de = spacy.blank('de')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell ISO code of the language class to load.
|
||||
|
||||
+row
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/usage/processing-pipelines#disabling") disable].
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell An empty #[code Language] object of the appropriate subclass.
|
||||
|
||||
|
||||
+h(3, "spacy.info") spacy.info
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -83,13 +113,13 @@ p
|
|||
+cell Print information as Markdown.
|
||||
|
||||
|
||||
+h(2, "explain") spacy.explain
|
||||
+h(3, "spacy.explain") spacy.explain
|
||||
+tag function
|
||||
|
||||
p
|
||||
| Get a description for a given POS tag, dependency label or entity type.
|
||||
| For a list of available terms, see
|
||||
| #[+src(gh("spacy", "spacy/glossary.py")) glossary.py].
|
||||
| #[+src(gh("spacy", "spacy/glossary.py")) #[code glossary.py]].
|
||||
|
||||
+aside-code("Example").
|
||||
spacy.explain('NORP')
|
||||
|
@ -107,18 +137,18 @@ p
|
|||
+cell unicode
|
||||
+cell Term to explain.
|
||||
|
||||
+footrow
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell unicode
|
||||
+cell The explanation, or #[code None] if not found in the glossary.
|
||||
|
||||
+h(2, "set_factory") spacy.set_factory
|
||||
+h(3, "spacy.set_factory") spacy.set_factory
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Set a factory that returns a custom
|
||||
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline]
|
||||
| #[+a("/usage/processing-pipelines") processing pipeline]
|
||||
| component. Factories are useful for creating stateful components, especially ones which depend on shared data.
|
||||
|
||||
+aside-code("Example").
|
|
@ -1,10 +1,8 @@
|
|||
//- 💫 DOCS > API > UTIL
|
||||
|
||||
include ../../_includes/_mixins
|
||||
//- 💫 DOCS > API > TOP-LEVEL > UTIL
|
||||
|
||||
p
|
||||
| spaCy comes with a small collection of utility functions located in
|
||||
| #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py].
|
||||
| #[+src(gh("spaCy", "spacy/util.py")) #[code spacy/util.py]].
|
||||
| Because utility functions are mostly intended for
|
||||
| #[strong internal use within spaCy], their behaviour may change with
|
||||
| future releases. The functions documented on this page should be safe
|
||||
|
@ -12,7 +10,7 @@ p
|
|||
| recommend having additional tests in place if your application depends on
|
||||
| any of spaCy's utilities.
|
||||
|
||||
+h(2, "get_data_path") util.get_data_path
|
||||
+h(3, "util.get_data_path") util.get_data_path
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -25,12 +23,12 @@ p
|
|||
+cell bool
|
||||
+cell Only return path if it exists, otherwise return #[code None].
|
||||
|
||||
+footrow
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code Path] / #[code None]
|
||||
+cell Data path or #[code None].
|
||||
|
||||
+h(2, "set_data_path") util.set_data_path
|
||||
+h(3, "util.set_data_path") util.set_data_path
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -47,12 +45,12 @@ p
|
|||
+cell unicode or #[code Path]
|
||||
+cell Path to new data directory.
|
||||
|
||||
+h(2, "get_lang_class") util.get_lang_class
|
||||
+h(3, "util.get_lang_class") util.get_lang_class
|
||||
+tag function
|
||||
|
||||
p
|
||||
| Import and load a #[code Language] class. Allows lazy-loading
|
||||
| #[+a("/docs/usage/adding-languages") language data] and importing
|
||||
| #[+a("/usage/adding-languages") language data] and importing
|
||||
| languages using the two-letter language code.
|
||||
|
||||
+aside-code("Example").
|
||||
|
@ -67,12 +65,12 @@ p
|
|||
+cell unicode
|
||||
+cell Two-letter language code, e.g. #[code 'en'].
|
||||
|
||||
+footrow
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell Language class.
|
||||
|
||||
+h(2, "load_model") util.load_model
|
||||
+h(3, "util.load_model") util.load_model
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
|
@ -101,12 +99,12 @@ p
|
|||
+cell -
|
||||
+cell Specific overrides, like pipeline components to disable.
|
||||
|
||||
+footrow
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell #[code Language] class with the loaded model.
|
||||
|
||||
+h(2, "load_model_from_path") util.load_model_from_path
|
||||
+h(3, "util.load_model_from_path") util.load_model_from_path
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
|
@ -139,18 +137,18 @@ p
|
|||
+cell -
|
||||
+cell Specific overrides, like pipeline components to disable.
|
||||
|
||||
+footrow
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell #[code Language] class with the loaded model.
|
||||
|
||||
+h(2, "load_model_from_init_py") util.load_model_from_init_py
|
||||
+h(3, "util.load_model_from_init_py") util.load_model_from_init_py
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| A helper function to use in the #[code load()] method of a model package's
|
||||
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py].
|
||||
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.util import load_model_from_init_py
|
||||
|
@ -169,12 +167,12 @@ p
|
|||
+cell -
|
||||
+cell Specific overrides, like pipeline components to disable.
|
||||
|
||||
+footrow
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell #[code Language] class with the loaded model.
|
||||
|
||||
+h(2, "get_model_meta") util.get_model_meta
|
||||
+h(3, "util.get_model_meta") util.get_model_meta
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
|
@ -190,17 +188,17 @@ p
|
|||
+cell unicode or #[code Path]
|
||||
+cell Path to model directory.
|
||||
|
||||
+footrow
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell dict
|
||||
+cell The model's metadata.
|
||||
|
||||
+h(2, "is_package") util.is_package
|
||||
+h(3, "util.is_package") util.is_package
|
||||
+tag function
|
||||
|
||||
p
|
||||
| Check if string maps to a package installed via pip. Mainly used to
|
||||
| validate #[+a("/docs/usage/models") model packages].
|
||||
| validate #[+a("/usage/models") model packages].
|
||||
|
||||
+aside-code("Example").
|
||||
util.is_package('en_core_web_sm') # True
|
||||
|
@ -212,18 +210,18 @@ p
|
|||
+cell unicode
|
||||
+cell Name of package.
|
||||
|
||||
+footrow
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code bool]
|
||||
+cell #[code True] if the package is installed, #[code False] if not.
|
||||
|
||||
+h(2, "get_package_path") util.get_package_path
|
||||
+h(3, "util.get_package_path") util.get_package_path
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Get path to an installed package. Mainly used to resolve the location of
|
||||
| #[+a("/docs/usage/models") model packages]. Currently imports the package
|
||||
| #[+a("/usage/models") model packages]. Currently imports the package
|
||||
| to find its path.
|
||||
|
||||
+aside-code("Example").
|
||||
|
@ -236,12 +234,12 @@ p
|
|||
+cell unicode
|
||||
+cell Name of installed package.
|
||||
|
||||
+footrow
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code Path]
|
||||
+cell Path to model package directory.
|
||||
|
||||
+h(2, "is_in_jupyter") util.is_in_jupyter
|
||||
+h(3, "util.is_in_jupyter") util.is_in_jupyter
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
|
@ -257,17 +255,17 @@ p
|
|||
return display(HTML(html))
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell #[code True] if in Jupyter, #[code False] if not.
|
||||
|
||||
+h(2, "update_exc") util.update_exc
|
||||
+h(3, "util.update_exc") util.update_exc
|
||||
+tag function
|
||||
|
||||
p
|
||||
| Update, validate and overwrite
|
||||
| #[+a("/docs/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions].
|
||||
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions].
|
||||
| Used to combine global exceptions with custom, language-specific
|
||||
| exceptions. Will raise an error if key doesn't match #[code ORTH] values.
|
||||
|
||||
|
@ -288,20 +286,20 @@ p
|
|||
+cell dicts
|
||||
+cell Exception dictionaries to add to the base exceptions, in order.
|
||||
|
||||
+footrow
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell dict
|
||||
+cell Combined tokenizer exceptions.
|
||||
|
||||
|
||||
+h(2, "prints") util.prints
|
||||
+h(3, "util.prints") util.prints
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Print a formatted, text-wrapped message with optional title. If a text
|
||||
| argument is a #[code Path], it's converted to a string. Should only
|
||||
| be used for interactive components like the #[+api("cli") cli].
|
||||
| be used for interactive components like the command-line interface.
|
||||
|
||||
+aside-code("Example").
|
||||
data_path = Path('/some/path')
|
131
website/api/annotation.jade
Normal file
131
website/api/annotation.jade
Normal file
|
@ -0,0 +1,131 @@
|
|||
//- 💫 DOCS > API > ANNOTATION SPECS
|
||||
|
||||
include ../_includes/_mixins
|
||||
|
||||
p This document describes the target annotations spaCy is trained to predict.
|
||||
|
||||
|
||||
+section("tokenization")
|
||||
+h(2, "tokenization") Tokenization
|
||||
|
||||
p
|
||||
| Tokenization standards are based on the
|
||||
| #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] corpus.
|
||||
| The tokenizer differs from most by including tokens for significant
|
||||
| whitespace. Any sequence of whitespace characters beyond a single space
|
||||
| (#[code ' ']) is included as a token.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.lang.en import English
|
||||
nlp = English()
|
||||
tokens = nlp('Some\nspaces and\ttab characters')
|
||||
tokens_text = [t.text for t in tokens]
|
||||
assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and',
|
||||
'\t', 'tab', 'characters']
|
||||
|
||||
p
|
||||
| The whitespace tokens are useful for much the same reason punctuation is
|
||||
| – it's often an important delimiter in the text. By preserving it in the
|
||||
| token output, we are able to maintain a simple alignment between the
|
||||
| tokens and the original string, and we ensure that no information is
|
||||
| lost during processing.
|
||||
|
||||
+section("sbd")
|
||||
+h(2, "sentence-boundary") Sentence boundary detection
|
||||
|
||||
p
|
||||
| Sentence boundaries are calculated from the syntactic parse tree, so
|
||||
| features such as punctuation and capitalisation play an important but
|
||||
| non-decisive role in determining the sentence boundaries. Usually this
|
||||
| means that the sentence boundaries will at least coincide with clause
|
||||
| boundaries, even given poorly punctuated text.
|
||||
|
||||
+section("pos-tagging")
|
||||
+h(2, "pos-tagging") Part-of-speech Tagging
|
||||
|
||||
+aside("Tip: Understanding tags")
|
||||
| You can also use #[code spacy.explain()] to get the description for the
|
||||
| string representation of a tag. For example,
|
||||
| #[code spacy.explain("RB")] will return "adverb".
|
||||
|
||||
include _annotation/_pos-tags
|
||||
|
||||
+section("lemmatization")
|
||||
+h(2, "lemmatization") Lemmatization
|
||||
|
||||
p A "lemma" is the uninflected form of a word. In English, this means:
|
||||
|
||||
+list
|
||||
+item #[strong Adjectives]: The form like "happy", not "happier" or "happiest"
|
||||
+item #[strong Adverbs]: The form like "badly", not "worse" or "worst"
|
||||
+item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children"
|
||||
+item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written"
|
||||
|
||||
p
|
||||
| The lemmatization data is taken from
|
||||
| #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a
|
||||
| special case for pronouns: all pronouns are lemmatized to the special
|
||||
| token #[code -PRON-].
|
||||
|
||||
+infobox("About spaCy's custom pronoun lemma")
|
||||
| Unlike verbs and common nouns, there's no clear base form of a personal
|
||||
| pronoun. Should the lemma of "me" be "I", or should we normalize person
|
||||
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
|
||||
| novel symbol, #[code -PRON-], which is used as the lemma for
|
||||
| all personal pronouns.
|
||||
|
||||
+section("dependency-parsing")
|
||||
+h(2, "dependency-parsing") Syntactic Dependency Parsing
|
||||
|
||||
+aside("Tip: Understanding labels")
|
||||
| You can also use #[code spacy.explain()] to get the description for the
|
||||
| string representation of a label. For example,
|
||||
| #[code spacy.explain("prt")] will return "particle".
|
||||
|
||||
include _annotation/_dep-labels
|
||||
|
||||
+section("named-entities")
|
||||
+h(2, "named-entities") Named Entity Recognition
|
||||
|
||||
+aside("Tip: Understanding entity types")
|
||||
| You can also use #[code spacy.explain()] to get the description for the
|
||||
| string representation of an entity label. For example,
|
||||
| #[code spacy.explain("LANGUAGE")] will return "any named language".
|
||||
|
||||
include _annotation/_named-entities
|
||||
|
||||
+h(3, "biluo") BILUO Scheme
|
||||
|
||||
include _annotation/_biluo
|
||||
|
||||
+section("training")
|
||||
+h(2, "json-input") JSON input format for training
|
||||
|
||||
+under-construction
|
||||
|
||||
p spaCy takes training data in the following format:
|
||||
|
||||
+code("Example structure").
|
||||
doc: {
|
||||
id: string,
|
||||
paragraphs: [{
|
||||
raw: string,
|
||||
sents: [int],
|
||||
tokens: [{
|
||||
start: int,
|
||||
tag: string,
|
||||
head: int,
|
||||
dep: string
|
||||
}],
|
||||
ner: [{
|
||||
start: int,
|
||||
end: int,
|
||||
label: string
|
||||
}],
|
||||
brackets: [{
|
||||
start: int,
|
||||
end: int,
|
||||
label: string
|
||||
}]
|
||||
}]
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
//- 💫 DOCS > API > BINDER
|
||||
|
||||
include ../../_includes/_mixins
|
||||
include ../_includes/_mixins
|
||||
|
||||
p A container class for serializing collections of #[code Doc] objects.
|
||||
|
5
website/api/dependencyparser.jade
Normal file
5
website/api/dependencyparser.jade
Normal file
|
@ -0,0 +1,5 @@
|
|||
//- 💫 DOCS > API > DEPENDENCYPARSER
|
||||
|
||||
include ../_includes/_mixins
|
||||
|
||||
!=partial("pipe", { subclass: "DependencyParser", short: "parser", pipeline_id: "parser" })
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user