mirror of https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00

Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in: commit 40e0da9cc1
@@ -292,10 +292,9 @@ for example to show more specific error messages, you can use the `is_config()`
 helper function.

 ```python
-from .compat import unicode_, json_dumps, is_config
+from .compat import unicode_, is_config

 compatible_unicode = unicode_('hello world')
-compatible_json = json_dumps({'key': 'value'})
 if is_config(windows=True, python2=True):
     print("You are using Python 2 on Windows.")
 ```
Makefile (2 changed lines)

@@ -5,7 +5,7 @@ dist/spacy.pex : spacy/*.py* spacy/*/*.py*
     python3.6 -m venv env3.6
     source env3.6/bin/activate
     env3.6/bin/pip install wheel
-    env3.6/bin/pip install -r requirements.txt
+    env3.6/bin/pip install -r requirements.txt --no-cache-dir --no-binary :all:
     env3.6/bin/python setup.py build_ext --inplace
     env3.6/bin/python setup.py sdist
     env3.6/bin/python setup.py bdist_wheel
@@ -1,3 +1,5 @@
+<a href="https://explosion.ai"><img src="https://explosion.ai/assets/img/logo.svg" width="125" height="125" align="right" /></a>
+
 # spaCy: Industrial-strength NLP

 spaCy is a library for advanced Natural Language Processing in Python and
@@ -3,7 +3,7 @@ from __future__ import unicode_literals

 import bz2
 import regex as re
-import ujson
+import srsly
 import sys
 import random
 import datetime

@@ -44,7 +44,7 @@ class Reddit(object):
                 line = line.strip()
                 if not line:
                     continue
-                comment = ujson.loads(line)
+                comment = srsly.json_loads(line)
                 if self.is_valid(comment):
                     text = self.strip_tags(comment["body"])
                     yield {"text": text}

@@ -75,7 +75,7 @@ class Reddit(object):
 def main(path):
     reddit = Reddit(path)
     for comment in reddit:
-        print(ujson.dumps(comment))
+        print(srsly.json_dumps(comment))


 if __name__ == "__main__":
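Side note (not part of the diff): the ujson → srsly swap above is mechanical. A minimal sketch of the equivalent round trip with srsly, using a made-up comment record:

```python
import srsly

# Hypothetical sample record, shaped like one line of the Reddit corpus.
comment = {"body": "Nice post!", "score": 3}
line = srsly.json_dumps(comment)          # dict -> JSON string
assert srsly.json_loads(line) == comment  # JSON string -> dict
```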
@@ -1,5 +1,12 @@
 """
-This example shows how to use an LSTM sentiment classification model trained using Keras in spaCy. spaCy splits the document into sentences, and each sentence is classified using the LSTM. The scores for the sentences are then aggregated to give the document score. This kind of hierarchical model is quite difficult in "pure" Keras or Tensorflow, but it's very effective. The Keras example on this dataset performs quite poorly, because it cuts off the documents so that they're a fixed size. This hurts review accuracy a lot, because people often summarise their rating in the final sentence
+This example shows how to use an LSTM sentiment classification model trained
+using Keras in spaCy. spaCy splits the document into sentences, and each
+sentence is classified using the LSTM. The scores for the sentences are then
+aggregated to give the document score. This kind of hierarchical model is quite
+difficult in "pure" Keras or Tensorflow, but it's very effective. The Keras
+example on this dataset performs quite poorly, because it cuts off the documents
+so that they're a fixed size. This hurts review accuracy a lot, because people
+often summarise their rating in the final sentence

 Prerequisites:
 spacy download en_vectors_web_lg
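The hierarchy the reflowed docstring describes, reduced to a sketch (not part of the example file; score_sentence stands in for the per-sentence LSTM prediction):

```python
def doc_score(doc, score_sentence):
    # Score each sentence separately, then average to get a document-level score.
    scores = [score_sentence(sent.text) for sent in doc.sents]
    return sum(scores) / len(scores) if scores else 0.0
```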
@@ -25,9 +32,9 @@ import spacy
 class SentimentAnalyser(object):
     @classmethod
     def load(cls, path, nlp, max_length=100):
-        with (path / 'config.json').open() as file_:
+        with (path / "config.json").open() as file_:
             model = model_from_json(file_.read())
-        with (path / 'model').open('rb') as file_:
+        with (path / "model").open("rb") as file_:
             lstm_weights = pickle.load(file_)
         embeddings = get_embeddings(nlp.vocab)
         model.set_weights([embeddings] + lstm_weights)

@@ -69,12 +76,12 @@ def get_labelled_sentences(docs, doc_labels):
         for sent in doc.sents:
             sentences.append(sent)
             labels.append(y)
-    return sentences, numpy.asarray(labels, dtype='int32')
+    return sentences, numpy.asarray(labels, dtype="int32")


 def get_features(docs, max_length):
     docs = list(docs)
-    Xs = numpy.zeros((len(docs), max_length), dtype='int32')
+    Xs = numpy.zeros((len(docs), max_length), dtype="int32")
     for i, doc in enumerate(docs):
         j = 0
         for token in doc:
@@ -89,16 +96,25 @@ def get_features(docs, max_length):
     return Xs


-def train(train_texts, train_labels, dev_texts, dev_labels,
-          lstm_shape, lstm_settings, lstm_optimizer, batch_size=100,
-          nb_epoch=5, by_sentence=True):
+def train(
+    train_texts,
+    train_labels,
+    dev_texts,
+    dev_labels,
+    lstm_shape,
+    lstm_settings,
+    lstm_optimizer,
+    batch_size=100,
+    nb_epoch=5,
+    by_sentence=True,
+):

     print("Loading spaCy")
-    nlp = spacy.load('en_vectors_web_lg')
-    nlp.add_pipe(nlp.create_pipe('sentencizer'))
+    nlp = spacy.load("en_vectors_web_lg")
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))
     embeddings = get_embeddings(nlp.vocab)
     model = compile_lstm(embeddings, lstm_shape, lstm_settings)

     print("Parsing texts...")
     train_docs = list(nlp.pipe(train_texts))
     dev_docs = list(nlp.pipe(dev_texts))

@@ -106,10 +122,15 @@ def train(train_texts, train_labels, dev_texts, dev_labels,
     train_docs, train_labels = get_labelled_sentences(train_docs, train_labels)
     dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels)

-    train_X = get_features(train_docs, lstm_shape['max_length'])
-    dev_X = get_features(dev_docs, lstm_shape['max_length'])
-    model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels),
-              epochs=nb_epoch, batch_size=batch_size)
+    train_X = get_features(train_docs, lstm_shape["max_length"])
+    dev_X = get_features(dev_docs, lstm_shape["max_length"])
+    model.fit(
+        train_X,
+        train_labels,
+        validation_data=(dev_X, dev_labels),
+        epochs=nb_epoch,
+        batch_size=batch_size,
+    )
     return model

@@ -119,19 +140,28 @@ def compile_lstm(embeddings, shape, settings):
         Embedding(
             embeddings.shape[0],
             embeddings.shape[1],
-            input_length=shape['max_length'],
+            input_length=shape["max_length"],
             trainable=False,
             weights=[embeddings],
-            mask_zero=True
+            mask_zero=True,
         )
     )
-    model.add(TimeDistributed(Dense(shape['nr_hidden'], use_bias=False)))
-    model.add(Bidirectional(LSTM(shape['nr_hidden'],
-                                 recurrent_dropout=settings['dropout'],
-                                 dropout=settings['dropout'])))
-    model.add(Dense(shape['nr_class'], activation='sigmoid'))
-    model.compile(optimizer=Adam(lr=settings['lr']), loss='binary_crossentropy',
-                  metrics=['accuracy'])
+    model.add(TimeDistributed(Dense(shape["nr_hidden"], use_bias=False)))
+    model.add(
+        Bidirectional(
+            LSTM(
+                shape["nr_hidden"],
+                recurrent_dropout=settings["dropout"],
+                dropout=settings["dropout"],
+            )
+        )
+    )
+    model.add(Dense(shape["nr_class"], activation="sigmoid"))
+    model.compile(
+        optimizer=Adam(lr=settings["lr"]),
+        loss="binary_crossentropy",
+        metrics=["accuracy"],
+    )
     return model

@@ -140,8 +170,8 @@ def get_embeddings(vocab):


 def evaluate(model_dir, texts, labels, max_length=100):
-    nlp = spacy.load('en_vectors_web_lg')
-    nlp.add_pipe(nlp.create_pipe('sentencizer'))
+    nlp = spacy.load("en_vectors_web_lg")
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))
     nlp.add_pipe(SentimentAnalyser.load(model_dir, nlp, max_length=max_length))

     correct = 0

@@ -154,7 +184,7 @@ def evaluate(model_dir, texts, labels, max_length=100):

 def read_data(data_dir, limit=0):
     examples = []
-    for subdir, label in (('pos', 1), ('neg', 0)):
+    for subdir, label in (("pos", 1), ("neg", 0)):
         for filename in (data_dir / subdir).iterdir():
             with filename.open() as file_:
                 text = file_.read()

@@ -162,7 +192,7 @@ def read_data(data_dir, limit=0):
     random.shuffle(examples)
     if limit >= 1:
         examples = examples[:limit]
     return zip(*examples)  # Unzips into two lists


 @plac.annotations(
@@ -176,13 +206,21 @@ def read_data(data_dir, limit=0):
     learn_rate=("Learn rate", "option", "e", float),
     nb_epoch=("Number of training epochs", "option", "i", int),
     batch_size=("Size of minibatches for training LSTM", "option", "b", int),
-    nr_examples=("Limit to N examples", "option", "n", int)
+    nr_examples=("Limit to N examples", "option", "n", int),
 )
-def main(model_dir=None, train_dir=None, dev_dir=None,
-         is_runtime=False,
-         nr_hidden=64, max_length=100,  # Shape
-         dropout=0.5, learn_rate=0.001,  # General NN config
-         nb_epoch=5, batch_size=256, nr_examples=-1):  # Training params
+def main(
+    model_dir=None,
+    train_dir=None,
+    dev_dir=None,
+    is_runtime=False,
+    nr_hidden=64,
+    max_length=100,  # Shape
+    dropout=0.5,
+    learn_rate=0.001,  # General NN config
+    nb_epoch=5,
+    batch_size=256,
+    nr_examples=-1,
+):  # Training params
     if model_dir is not None:
         model_dir = pathlib.Path(model_dir)
     if train_dir is None or dev_dir is None:

@@ -204,20 +242,26 @@ def main(model_dir=None, train_dir=None, dev_dir=None,
         dev_texts, dev_labels = zip(*imdb_data[1])
     else:
         dev_texts, dev_labels = read_data(dev_dir, imdb_data, limit=nr_examples)
-    train_labels = numpy.asarray(train_labels, dtype='int32')
-    dev_labels = numpy.asarray(dev_labels, dtype='int32')
-    lstm = train(train_texts, train_labels, dev_texts, dev_labels,
-                 {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 1},
-                 {'dropout': dropout, 'lr': learn_rate},
-                 {},
-                 nb_epoch=nb_epoch, batch_size=batch_size)
+    train_labels = numpy.asarray(train_labels, dtype="int32")
+    dev_labels = numpy.asarray(dev_labels, dtype="int32")
+    lstm = train(
+        train_texts,
+        train_labels,
+        dev_texts,
+        dev_labels,
+        {"nr_hidden": nr_hidden, "max_length": max_length, "nr_class": 1},
+        {"dropout": dropout, "lr": learn_rate},
+        {},
+        nb_epoch=nb_epoch,
+        batch_size=batch_size,
+    )
     weights = lstm.get_weights()
     if model_dir is not None:
-        with (model_dir / 'model').open('wb') as file_:
+        with (model_dir / "model").open("wb") as file_:
             pickle.dump(weights[1:], file_)
-        with (model_dir / 'config.json').open('w') as file_:
+        with (model_dir / "config.json").open("w") as file_:
             file_.write(lstm.to_json())


-if __name__ == '__main__':
+if __name__ == "__main__":
     plac.call(main)
@@ -15,14 +15,15 @@ import spacy


 TEXTS = [
-    'Net income was $9.4 million compared to the prior year of $2.7 million.',
-    'Revenue exceeded twelve billion dollars, with a loss of $1b.',
+    "Net income was $9.4 million compared to the prior year of $2.7 million.",
+    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
 ]


 @plac.annotations(
-    model=("Model to load (needs parser and NER)", "positional", None, str))
-def main(model='en_core_web_sm'):
+    model=("Model to load (needs parser and NER)", "positional", None, str)
+)
+def main(model="en_core_web_sm"):
     nlp = spacy.load(model)
     print("Loaded model '%s'" % model)
     print("Processing %d texts" % len(TEXTS))

@@ -31,7 +32,7 @@ def main(model='en_core_web_sm'):
         doc = nlp(text)
         relations = extract_currency_relations(doc)
         for r1, r2 in relations:
-            print('{:<10}\t{}\t{}'.format(r1.text, r2.ent_type_, r2.text))
+            print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text))


 def extract_currency_relations(doc):

@@ -41,18 +42,18 @@ def extract_currency_relations(doc):
         span.merge()

     relations = []
-    for money in filter(lambda w: w.ent_type_ == 'MONEY', doc):
-        if money.dep_ in ('attr', 'dobj'):
-            subject = [w for w in money.head.lefts if w.dep_ == 'nsubj']
+    for money in filter(lambda w: w.ent_type_ == "MONEY", doc):
+        if money.dep_ in ("attr", "dobj"):
+            subject = [w for w in money.head.lefts if w.dep_ == "nsubj"]
             if subject:
                 subject = subject[0]
                 relations.append((subject, money))
-        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
+        elif money.dep_ == "pobj" and money.head.dep_ == "prep":
             relations.append((money.head.head, money))
     return relations


-if __name__ == '__main__':
+if __name__ == "__main__":
     plac.call(main)

 # Expected output:
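The relation extraction above keys on dependency labels and entity types around MONEY entities. A quick, hedged way to inspect those attributes for one of the TEXTS (assumes the en_core_web_sm model is installed; output depends on the model version):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Net income was $9.4 million compared to the prior year of $2.7 million.")
# Print each token inside a MONEY entity with the dependency label and head
# that extract_currency_relations() checks.
for token in doc:
    if token.ent_type_ == "MONEY":
        print(token.text, token.dep_, token.head.text)
```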
@@ -24,37 +24,39 @@ import plac
 import spacy


-@plac.annotations(
-    model=("Model to load", "positional", None, str))
-def main(model='en_core_web_sm'):
+@plac.annotations(model=("Model to load", "positional", None, str))
+def main(model="en_core_web_sm"):
     nlp = spacy.load(model)
     print("Loaded model '%s'" % model)

-    doc = nlp("displaCy uses CSS and JavaScript to show you how computers "
-              "understand language")
+    doc = nlp(
+        "displaCy uses CSS and JavaScript to show you how computers "
+        "understand language"
+    )

     # The easiest way is to find the head of the subtree you want, and then use
     # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree`
     # is the one that does what you're asking for most directly:
     for word in doc:
-        if word.dep_ in ('xcomp', 'ccomp'):
-            print(''.join(w.text_with_ws for w in word.subtree))
+        if word.dep_ in ("xcomp", "ccomp"):
+            print("".join(w.text_with_ws for w in word.subtree))

     # It'd probably be better for `word.subtree` to return a `Span` object
     # instead of a generator over the tokens. If you want the `Span` you can
     # get it via the `.right_edge` and `.left_edge` properties. The `Span`
     # object is nice because you can easily get a vector, merge it, etc.
     for word in doc:
-        if word.dep_ in ('xcomp', 'ccomp'):
+        if word.dep_ in ("xcomp", "ccomp"):
             subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
-            print(subtree_span.text, '|', subtree_span.root.text)
+            print(subtree_span.text, "|", subtree_span.root.text)

     # You might also want to select a head, and then select a start and end
     # position by walking along its children. You could then take the
     # `.left_edge` and `.right_edge` of those tokens, and use it to calculate
     # a span.

-if __name__ == '__main__':
+
+if __name__ == "__main__":
     plac.call(main)

 # Expected output:
@@ -45,7 +45,7 @@ from __future__ import print_function, unicode_literals, division
 from bz2 import BZ2File
 import time
 import plac
-import ujson
+import json

 from spacy.matcher import PhraseMatcher
 import spacy

@@ -55,15 +55,15 @@ import spacy
     patterns_loc=("Path to gazetteer", "positional", None, str),
     text_loc=("Path to Reddit corpus file", "positional", None, str),
     n=("Number of texts to read", "option", "n", int),
-    lang=("Language class to initialise", "option", "l", str))
-def main(patterns_loc, text_loc, n=10000, lang='en'):
-    nlp = spacy.blank('en')
+    lang=("Language class to initialise", "option", "l", str),
+)
+def main(patterns_loc, text_loc, n=10000, lang="en"):
+    nlp = spacy.blank("en")
     nlp.vocab.lex_attr_getters = {}
     phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
     count = 0
     t1 = time.time()
-    for ent_id, text in get_matches(nlp.tokenizer, phrases,
-                                    read_text(text_loc, n=n)):
+    for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
         count += 1
     t2 = time.time()
     print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))

@@ -71,8 +71,8 @@ def main(patterns_loc, text_loc, n=10000, lang='en'):

 def read_gazetteer(tokenizer, loc, n=-1):
     for i, line in enumerate(open(loc)):
-        data = ujson.loads(line.strip())
-        phrase = tokenizer(data['text'])
+        data = json.loads(line.strip())
+        phrase = tokenizer(data["text"])
         for w in phrase:
             _ = tokenizer.vocab[w.text]
         if len(phrase) >= 2:

@@ -82,15 +82,15 @@ def read_gazetteer(tokenizer, loc, n=-1):
 def read_text(bz2_loc, n=10000):
     with BZ2File(bz2_loc) as file_:
         for i, line in enumerate(file_):
-            data = ujson.loads(line)
-            yield data['body']
+            data = json.loads(line)
+            yield data["body"]
             if i >= n:
                 break


 def get_matches(tokenizer, phrases, texts, max_length=6):
     matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
-    matcher.add('Phrase', None, *phrases)
+    matcher.add("Phrase", None, *phrases)
     for text in texts:
         doc = tokenizer(text)
         for w in doc:

@@ -100,10 +100,11 @@ def get_matches(tokenizer, phrases, texts, max_length=6):
             yield (ent_id, doc[start:end].text)

-if __name__ == '__main__':
+
+if __name__ == "__main__":
     if False:
         import cProfile
         import pstats

         cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
         s = pstats.Stats("Profile.prof")
         s.strip_dirs().sort_stats("time").print_stats()
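For context, a minimal round trip with the v2-era PhraseMatcher API used in this file, shown in isolation (the gazetteer terms here are made up):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp("Czech Republic"), nlp("New York")]  # hypothetical gazetteer entries
matcher.add("Phrase", None, *patterns)  # v2 signature: add(key, on_match, *docs)
doc = nlp("She flew from New York to the Czech Republic.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
```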
@@ -1,5 +1,5 @@
 import numpy as np
-import ujson as json
+import json
 from keras.utils import to_categorical
 import plac
 import sys

@@ -32,7 +32,7 @@ def set_keras_backend(backend):
         K.set_session(K.tf.Session(config=cfg))
         K.clear_session()

 set_keras_backend("tensorflow")


 def train(train_loc, dev_loc, shape, settings):

@@ -42,7 +42,7 @@ def train(train_loc, dev_loc, shape, settings):
     print("Loading spaCy")
     nlp = spacy.load('en_vectors_web_lg')
     assert nlp.path is not None

     print("Processing texts...")
     train_X = create_dataset(nlp, train_texts1, train_texts2, 100, shape[0])
     dev_X = create_dataset(nlp, dev_texts1, dev_texts2, 100, shape[0])

@@ -57,7 +57,7 @@ def train(train_loc, dev_loc, shape, settings):
         validation_data = (dev_X, dev_labels),
         epochs = settings['nr_epoch'],
         batch_size = settings['batch_size'])

     if not (nlp.path / 'similarity').exists():
         (nlp.path / 'similarity').mkdir()
     print("Saving to", nlp.path / 'similarity')

@@ -74,7 +74,7 @@ def evaluate(dev_loc, shape):
     dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
     nlp = spacy.load('en_vectors_web_lg')
     nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0]))

     total = 0.
     correct = 0.
     for text1, text2, label in zip(dev_texts1, dev_texts2, dev_labels):

@@ -119,33 +119,33 @@ def read_snli(path):

 def create_dataset(nlp, texts, hypotheses, num_unk, max_length):
     sents = texts + hypotheses

     sents_as_ids = []
     for sent in sents:
         doc = nlp(sent)
         word_ids = []

         for i, token in enumerate(doc):
             # skip odd spaces from tokenizer
             if token.has_vector and token.vector_norm == 0:
                 continue

             if i > max_length:
                 break

             if token.has_vector:
                 word_ids.append(token.rank + num_unk + 1)
             else:
                 # if we don't have a vector, pick an OOV entry
                 word_ids.append(token.rank % num_unk + 1)

         # there must be a simpler way of generating padded arrays from lists...
         word_id_vec = np.zeros((max_length), dtype='int')
         clipped_len = min(max_length, len(word_ids))
         word_id_vec[:clipped_len] = word_ids[:clipped_len]
         sents_as_ids.append(word_id_vec)


     return [np.array(sents_as_ids[:len(texts)]), np.array(sents_as_ids[len(texts):])]

@@ -169,7 +169,7 @@ def main(mode, train_loc, dev_loc,
         batch_size = 1024,
         nr_epoch = 10,
         entail_dir="both"):

     shape = (max_length, nr_hidden, 3)
     settings = {
         'lr': learn_rate,
@@ -10,19 +10,19 @@ def build_model(vectors, shape, settings):

     input1 = layers.Input(shape=(max_length,), dtype='int32', name='words1')
     input2 = layers.Input(shape=(max_length,), dtype='int32', name='words2')

     # embeddings (projected)
     embed = create_embedding(vectors, max_length, nr_hidden)

     a = embed(input1)
     b = embed(input2)

     # step 1: attend
     F = create_feedforward(nr_hidden)
     att_weights = layers.dot([F(a), F(b)], axes=-1)

     G = create_feedforward(nr_hidden)

     if settings['entail_dir'] == 'both':
         norm_weights_a = layers.Lambda(normalizer(1))(att_weights)
         norm_weights_b = layers.Lambda(normalizer(2))(att_weights)

@@ -55,18 +55,18 @@ def build_model(vectors, shape, settings):
     v1 = layers.TimeDistributed(G)(comp1)
     v1_sum = layers.Lambda(sum_word)(v1)
     concat = v1_sum

     H = create_feedforward(nr_hidden)
     out = H(concat)
     out = layers.Dense(nr_class, activation='softmax')(out)

     model = Model([input1, input2], out)

     model.compile(
         optimizer=optimizers.Adam(lr=settings['lr']),
         loss='categorical_crossentropy',
         metrics=['accuracy'])

     return model


@@ -78,7 +78,7 @@ def create_embedding(vectors, max_length, projected_dim):
             input_length=max_length,
             weights=[vectors],
             trainable=False),

         layers.TimeDistributed(
             layers.Dense(projected_dim,
                 activation=None,
@@ -77,7 +77,7 @@
     }
    ],
    "source": [
-    "import ujson as json\n",
+    "import json\n",
     "from keras.utils import to_categorical\n",
     "\n",
     "LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n",
@@ -19,39 +19,40 @@ from pathlib import Path


 @plac.annotations(
-    output_dir=("Output directory for saved HTML", "positional", None, Path))
+    output_dir=("Output directory for saved HTML", "positional", None, Path)
+)
 def main(output_dir=None):
     nlp = English()  # start off with blank English class

-    Doc.set_extension('overlap', method=overlap_tokens)
-    doc1 = nlp(u"Peach emoji is where it has always been.")
-    doc2 = nlp(u"Peach is the superior emoji.")
+    Doc.set_extension("overlap", method=overlap_tokens)
+    doc1 = nlp("Peach emoji is where it has always been.")
+    doc2 = nlp("Peach is the superior emoji.")
     print("Text 1:", doc1.text)
     print("Text 2:", doc2.text)
     print("Overlapping tokens:", doc1._.overlap(doc2))

-    Doc.set_extension('to_html', method=to_html)
-    doc = nlp(u"This is a sentence about Apple.")
+    Doc.set_extension("to_html", method=to_html)
+    doc = nlp("This is a sentence about Apple.")
     # add entity manually for demo purposes, to make it work without a model
-    doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
+    doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings["ORG"])]
     print("Text:", doc.text)
-    doc._.to_html(output=output_dir, style='ent')
+    doc._.to_html(output=output_dir, style="ent")


-def to_html(doc, output='/tmp', style='dep'):
+def to_html(doc, output="/tmp", style="dep"):
     """Doc method extension for saving the current state as a displaCy
     visualization.
     """
     # generate filename from first six non-punct tokens
-    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
+    file_name = "-".join([w.text for w in doc[:6] if not w.is_punct]) + ".html"
     html = displacy.render(doc, style=style, page=True)  # render markup
     if output is not None:
         output_path = Path(output)
         if not output_path.exists():
             output_path.mkdir()
         output_file = Path(output) / file_name
-        output_file.open('w', encoding='utf-8').write(html)  # save to file
-        print('Saved HTML to {}'.format(output_file))
+        output_file.open("w", encoding="utf-8").write(html)  # save to file
+        print("Saved HTML to {}".format(output_file))
     else:
         print(html)

@@ -67,7 +68,7 @@ def overlap_tokens(doc, other_doc):
     return overlap


-if __name__ == '__main__':
+if __name__ == "__main__":
     plac.call(main)

 # Expected output:
@@ -25,15 +25,19 @@ def main():
     # and no model or pre-defined pipeline loaded.
     nlp = English()
     rest_countries = RESTCountriesComponent(nlp)  # initialise component
     nlp.add_pipe(rest_countries)  # add it to the pipeline
-    doc = nlp(u"Some text about Colombia and the Czech Republic")
-    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
-    print('Doc has countries', doc._.has_country)  # Doc contains countries
+    doc = nlp("Some text about Colombia and the Czech Republic")
+    print("Pipeline", nlp.pipe_names)  # pipeline contains component name
+    print("Doc has countries", doc._.has_country)  # Doc contains countries
     for token in doc:
         if token._.is_country:
-            print(token.text, token._.country_capital, token._.country_latlng,
-                  token._.country_flag)  # country data
-    print('Entities', [(e.text, e.label_) for e in doc.ents])  # entities
+            print(
+                token.text,
+                token._.country_capital,
+                token._.country_latlng,
+                token._.country_flag,
+            )  # country data
+    print("Entities", [(e.text, e.label_) for e in doc.ents])  # entities


 class RESTCountriesComponent(object):

@@ -41,42 +45,42 @@ class RESTCountriesComponent(object):
     the REST Countries API, merges country names into one token, assigns entity
     labels and sets attributes on country tokens.
     """
-    name = 'rest_countries'  # component name, will show up in the pipeline

-    def __init__(self, nlp, label='GPE'):
+    name = "rest_countries"  # component name, will show up in the pipeline
+
+    def __init__(self, nlp, label="GPE"):
         """Initialise the pipeline component. The shared nlp instance is used
         to initialise the matcher with the shared vocab, get the label ID and
         generate Doc objects as phrase match patterns.
         """
         # Make request once on initialisation and store the data
-        r = requests.get('https://restcountries.eu/rest/v2/all')
+        r = requests.get("https://restcountries.eu/rest/v2/all")
         r.raise_for_status()  # make sure requests raises an error if it fails
         countries = r.json()

         # Convert API response to dict keyed by country name for easy lookup
         # This could also be extended using the alternative and foreign language
         # names provided by the API
-        self.countries = {c['name']: c for c in countries}
+        self.countries = {c["name"]: c for c in countries}
         self.label = nlp.vocab.strings[label]  # get entity label ID

         # Set up the PhraseMatcher with Doc patterns for each country name
         patterns = [nlp(c) for c in self.countries.keys()]
         self.matcher = PhraseMatcher(nlp.vocab)
-        self.matcher.add('COUNTRIES', None, *patterns)
+        self.matcher.add("COUNTRIES", None, *patterns)

         # Register attribute on the Token. We'll be overwriting this based on
         # the matches, so we're only setting a default value, not a getter.
         # If no default value is set, it defaults to None.
-        Token.set_extension('is_country', default=False)
-        Token.set_extension('country_capital', default=False)
-        Token.set_extension('country_latlng', default=False)
-        Token.set_extension('country_flag', default=False)
+        Token.set_extension("is_country", default=False)
+        Token.set_extension("country_capital", default=False)
+        Token.set_extension("country_latlng", default=False)
+        Token.set_extension("country_flag", default=False)

         # Register attributes on Doc and Span via a getter that checks if one of
         # the contained tokens is set to is_country == True.
-        Doc.set_extension('has_country', getter=self.has_country)
-        Span.set_extension('has_country', getter=self.has_country)
+        Doc.set_extension("has_country", getter=self.has_country)
+        Span.set_extension("has_country", getter=self.has_country)

     def __call__(self, doc):
         """Apply the pipeline component on a Doc object and modify it if matches

@@ -93,10 +97,10 @@ class RESTCountriesComponent(object):
             # Can be extended with other data returned by the API, like
             # currencies, country code, flag, calling code etc.
             for token in entity:
-                token._.set('is_country', True)
-                token._.set('country_capital', self.countries[entity.text]['capital'])
-                token._.set('country_latlng', self.countries[entity.text]['latlng'])
-                token._.set('country_flag', self.countries[entity.text]['flag'])
+                token._.set("is_country", True)
+                token._.set("country_capital", self.countries[entity.text]["capital"])
+                token._.set("country_latlng", self.countries[entity.text]["latlng"])
+                token._.set("country_flag", self.countries[entity.text]["flag"])
             # Overwrite doc.ents and add entity – be careful not to replace!
             doc.ents = list(doc.ents) + [entity]
         for span in spans:

@@ -111,10 +115,10 @@ class RESTCountriesComponent(object):
         is a country. Since the getter is only called when we access the
         attribute, we can refer to the Token's 'is_country' attribute here,
         which is already set in the processing step."""
-        return any([t._.get('is_country') for t in tokens])
+        return any([t._.get("is_country") for t in tokens])


-if __name__ == '__main__':
+if __name__ == "__main__":
     plac.call(main)

 # Expected output:
@@ -20,23 +20,24 @@ from spacy.tokens import Doc, Span, Token

 @plac.annotations(
     text=("Text to process", "positional", None, str),
-    companies=("Names of technology companies", "positional", None, str))
+    companies=("Names of technology companies", "positional", None, str),
+)
 def main(text="Alphabet Inc. is the company behind Google.", *companies):
     # For simplicity, we start off with only the blank English Language class
     # and no model or pre-defined pipeline loaded.
     nlp = English()
     if not companies:  # set default companies if none are set via args
-        companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
+        companies = ["Alphabet Inc.", "Google", "Netflix", "Apple"]  # etc.
     component = TechCompanyRecognizer(nlp, companies)  # initialise component
     nlp.add_pipe(component, last=True)  # add last to the pipeline

     doc = nlp(text)
-    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
-    print('Tokens', [t.text for t in doc])  # company names from the list are merged
-    print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
-    print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
-    print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
-    print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
+    print("Pipeline", nlp.pipe_names)  # pipeline contains component name
+    print("Tokens", [t.text for t in doc])  # company names from the list are merged
+    print("Doc has_tech_org", doc._.has_tech_org)  # Doc contains tech orgs
+    print("Token 0 is_tech_org", doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
+    print("Token 1 is_tech_org", doc[1]._.is_tech_org)  # "is" is not
+    print("Entities", [(e.text, e.label_) for e in doc.ents])  # all orgs are entities


 class TechCompanyRecognizer(object):

@@ -45,9 +46,10 @@ class TechCompanyRecognizer(object):
     labelled as ORG and their spans are merged into one token. Additionally,
     ._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token
     respectively."""
-    name = 'tech_companies'  # component name, will show up in the pipeline

-    def __init__(self, nlp, companies=tuple(), label='ORG'):
+    name = "tech_companies"  # component name, will show up in the pipeline
+
+    def __init__(self, nlp, companies=tuple(), label="ORG"):
         """Initialise the pipeline component. The shared nlp instance is used
         to initialise the matcher with the shared vocab, get the label ID and
         generate Doc objects as phrase match patterns.

@@ -58,16 +60,16 @@ class TechCompanyRecognizer(object):
         # so even if the list of companies is long, it's very efficient
         patterns = [nlp(org) for org in companies]
         self.matcher = PhraseMatcher(nlp.vocab)
-        self.matcher.add('TECH_ORGS', None, *patterns)
+        self.matcher.add("TECH_ORGS", None, *patterns)

         # Register attribute on the Token. We'll be overwriting this based on
         # the matches, so we're only setting a default value, not a getter.
-        Token.set_extension('is_tech_org', default=False)
+        Token.set_extension("is_tech_org", default=False)

         # Register attributes on Doc and Span via a getter that checks if one of
         # the contained tokens is set to is_tech_org == True.
-        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
-        Span.set_extension('has_tech_org', getter=self.has_tech_org)
+        Doc.set_extension("has_tech_org", getter=self.has_tech_org)
+        Span.set_extension("has_tech_org", getter=self.has_tech_org)

     def __call__(self, doc):
         """Apply the pipeline component on a Doc object and modify it if matches

@@ -82,7 +84,7 @@ class TechCompanyRecognizer(object):
             spans.append(entity)
             # Set custom attribute on each token of the entity
             for token in entity:
-                token._.set('is_tech_org', True)
+                token._.set("is_tech_org", True)
             # Overwrite doc.ents and add entity – be careful not to replace!
             doc.ents = list(doc.ents) + [entity]
         for span in spans:

@@ -97,10 +99,10 @@ class TechCompanyRecognizer(object):
         is a tech org. Since the getter is only called when we access the
         attribute, we can refer to the Token's 'is_tech_org' attribute here,
         which is already set in the processing step."""
-        return any([t._.get('is_tech_org') for t in tokens])
+        return any([t._.get("is_tech_org") for t in tokens])


-if __name__ == '__main__':
+if __name__ == "__main__":
     plac.call(main)

 # Expected output:
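Both pipeline-component examples above rely on the same extension-attribute pattern; stripped to its core, and reusing the names from the diff, it looks roughly like this standalone sketch:

```python
from spacy.tokens import Doc, Token

# Per-token flag with a default value, overwritten by the component when it matches.
Token.set_extension("is_tech_org", default=False)
# Doc-level attribute computed on access from the token flags.
Doc.set_extension("has_tech_org", getter=lambda doc: any(t._.is_tech_org for t in doc))
```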
@@ -1,4 +1,4 @@
-'''Example of adding a pipeline component to prohibit sentence boundaries
+"""Example of adding a pipeline component to prohibit sentence boundaries
 before certain tokens.

 What we do is write to the token.is_sent_start attribute, which

@@ -10,16 +10,18 @@ should also improve the parse quality.
 The specific example here is drawn from https://github.com/explosion/spaCy/issues/2627
 Other versions of the model may not make the original mistake, so the specific
 example might not be apt for future versions.
-'''
+"""
 import plac
 import spacy

+
 def prevent_sentence_boundaries(doc):
     for token in doc:
         if not can_be_sentence_start(token):
             token.is_sent_start = False
     return doc

+
 def can_be_sentence_start(token):
     if token.i == 0:
         return True

@@ -32,17 +34,18 @@ def can_be_sentence_start(token):
     else:
         return False

+
 def main():
-    nlp = spacy.load('en_core_web_lg')
+    nlp = spacy.load("en_core_web_lg")
     raw_text = "Been here and I'm loving it."
     doc = nlp(raw_text)
     sentences = [sent.string.strip() for sent in doc.sents]
     print(sentences)
-    nlp.add_pipe(prevent_sentence_boundaries, before='parser')
+    nlp.add_pipe(prevent_sentence_boundaries, before="parser")
     doc = nlp(raw_text)
     sentences = [sent.string.strip() for sent in doc.sents]
     print(sentences)


-if __name__ == '__main__':
+if __name__ == "__main__":
     plac.call(main)
@@ -1,10 +1,11 @@
-'''Demonstrate adding a rule-based component that forces some tokens to not
+"""Demonstrate adding a rule-based component that forces some tokens to not
 be entities, before the NER tagger is applied. This is used to hotfix the issue
 in https://github.com/explosion/spaCy/issues/2870 , present as of spaCy v2.0.16.
-'''
+"""
 import spacy
 from spacy.attrs import ENT_IOB

+
 def fix_space_tags(doc):
     ent_iobs = doc.to_array([ENT_IOB])
     for i, token in enumerate(doc):

@@ -14,14 +15,16 @@ def fix_space_tags(doc):
     doc.from_array([ENT_IOB], ent_iobs.reshape((len(doc), 1)))
     return doc

-def main():
-    nlp = spacy.load('en_core_web_sm')
-    text = u'''This is some crazy test where I dont need an Apple Watch to make things bug'''
-    doc = nlp(text)
-    print('Before', doc.ents)
-    nlp.add_pipe(fix_space_tags, name='fix-ner', before='ner')
-    doc = nlp(text)
-    print('After', doc.ents)
-
-if __name__ == '__main__':
+
+def main():
+    nlp = spacy.load("en_core_web_sm")
+    text = u"""This is some crazy test where I dont need an Apple Watch to make things bug"""
+    doc = nlp(text)
+    print("Before", doc.ents)
+    nlp.add_pipe(fix_space_tags, name="fix-ner", before="ner")
+    doc = nlp(text)
+    print("After", doc.ents)
+
+
+if __name__ == "__main__":
     main()
@@ -9,6 +9,7 @@ built-in dataset loader.
 Compatible with: spaCy v2.0.0+
 """
 from __future__ import print_function, unicode_literals
+
 from toolz import partition_all
 from pathlib import Path
 from joblib import Parallel, delayed
@@ -22,9 +23,9 @@ import spacy
     model=("Model name (needs tagger)", "positional", None, str),
     n_jobs=("Number of workers", "option", "n", int),
     batch_size=("Batch-size for each process", "option", "b", int),
-    limit=("Limit of entries from the dataset", "option", "l", int))
-def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000,
-         limit=10000):
+    limit=("Limit of entries from the dataset", "option", "l", int),
+)
+def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10000):
     nlp = spacy.load(model)  # load spaCy model
     print("Loaded model '%s'" % model)
     if not output_dir.exists():
@@ -37,42 +38,44 @@ def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000,
     partitions = partition_all(batch_size, texts)
     executor = Parallel(n_jobs=n_jobs)
     do = delayed(transform_texts)
-    tasks = (do(nlp, i, batch, output_dir)
-             for i, batch in enumerate(partitions))
+    tasks = (do(nlp, i, batch, output_dir) for i, batch in enumerate(partitions))
     executor(tasks)
 
 
 def transform_texts(nlp, batch_id, texts, output_dir):
     print(nlp.pipe_names)
-    out_path = Path(output_dir) / ('%d.txt' % batch_id)
+    out_path = Path(output_dir) / ("%d.txt" % batch_id)
     if out_path.exists():  # return None in case same batch is called again
         return None
-    print('Processing batch', batch_id)
-    with out_path.open('w', encoding='utf8') as f:
+    print("Processing batch", batch_id)
+    with out_path.open("w", encoding="utf8") as f:
         for doc in nlp.pipe(texts):
-            f.write(' '.join(represent_word(w) for w in doc if not w.is_space))
-            f.write('\n')
-    print('Saved {} texts to {}.txt'.format(len(texts), batch_id))
+            f.write(" ".join(represent_word(w) for w in doc if not w.is_space))
+            f.write("\n")
+    print("Saved {} texts to {}.txt".format(len(texts), batch_id))
 
 
 def represent_word(word):
     text = word.text
     # True-case, i.e. try to normalize sentence-initial capitals.
     # Only do this if the lower-cased form is more probable.
-    if text.istitle() and is_sent_begin(word) \
-    and word.prob < word.doc.vocab[text.lower()].prob:
+    if (
+        text.istitle()
+        and is_sent_begin(word)
+        and word.prob < word.doc.vocab[text.lower()].prob
+    ):
         text = text.lower()
-    return text + '|' + word.tag_
+    return text + "|" + word.tag_
 
 
 def is_sent_begin(word):
     if word.i == 0:
         return True
-    elif word.i >= 2 and word.nbor(-1).text in ('.', '!', '?', '...'):
+    elif word.i >= 2 and word.nbor(-1).text in (".", "!", "?", "..."):
         return True
     else:
         return False
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     plac.call(main)
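For reference, this is the batching pattern the script above relies on, reduced to a runnable sketch. The `process_chunk` function and the toy `texts` list are placeholders standing in for `transform_texts()` and the real dataset, not part of the diff; `toolz` and `joblib` are assumed to be installed.

```python
from toolz import partition_all
from joblib import Parallel, delayed


def process_chunk(chunk_id, texts):
    # Stand-in for transform_texts(): do the per-batch work and return a summary.
    return chunk_id, len(texts)


texts = ["first text", "second text", "third text", "fourth text", "fifth text"]
partitions = partition_all(2, texts)  # lazy chunks of up to 2 texts each
executor = Parallel(n_jobs=2)
tasks = (delayed(process_chunk)(i, list(chunk)) for i, chunk in enumerate(partitions))
print(executor(tasks))
```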
@@ -1,6 +1,6 @@
-'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
+"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
 .conllu format for development data, allowing the official scorer to be used.
-'''
+"""
 from __future__ import unicode_literals
 import plac
 import tqdm
@@ -22,7 +22,6 @@ from spacy.matcher import Matcher
 import itertools
 import random
 import numpy.random
-import cytoolz
 
 import conll17_ud_eval
 
@@ -35,6 +34,7 @@ spacy.lang.ja.Japanese.Defaults.use_janome = False
 random.seed(0)
 numpy.random.seed(0)
 
+
 def minibatch_by_words(items, size=5000):
     random.shuffle(items)
     if isinstance(size, int):
@@ -59,21 +59,31 @@ def minibatch_by_words(items, size=5000):
         else:
             break
 
 
 ################
 # Data reading #
 ################
 
-space_re = re.compile('\s+')
-def split_text(text):
-    return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
-
-def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
-              max_doc_length=None, limit=None):
-    '''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
+space_re = re.compile("\s+")
+
+
+def split_text(text):
+    return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
+
+
+def read_data(
+    nlp,
+    conllu_file,
+    text_file,
+    raw_text=True,
+    oracle_segments=False,
+    max_doc_length=None,
+    limit=None,
+):
+    """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
     include Doc objects created using nlp.make_doc and then aligned against
     the gold-standard sequences. If oracle_segments=True, include Doc objects
-    created from the gold-standard segments. At least one must be True.'''
+    created from the gold-standard segments. At least one must be True."""
     if not raw_text and not oracle_segments:
         raise ValueError("At least one of raw_text or oracle_segments must be True")
     paragraphs = split_text(text_file.read())
@@ -87,22 +97,21 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
         for cs in cd:
             sent = defaultdict(list)
             for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
-                if '.' in id_:
+                if "." in id_:
                     continue
-                if '-' in id_:
+                if "-" in id_:
                     continue
-                id_ = int(id_)-1
-                head = int(head)-1 if head != '0' else id_
-                sent['words'].append(word)
-                sent['tags'].append(tag)
-                sent['heads'].append(head)
-                sent['deps'].append('ROOT' if dep == 'root' else dep)
-                sent['spaces'].append(space_after == '_')
-            sent['entities'] = ['-'] * len(sent['words'])
-            sent['heads'], sent['deps'] = projectivize(sent['heads'],
-                                                       sent['deps'])
+                id_ = int(id_) - 1
+                head = int(head) - 1 if head != "0" else id_
+                sent["words"].append(word)
+                sent["tags"].append(tag)
+                sent["heads"].append(head)
+                sent["deps"].append("ROOT" if dep == "root" else dep)
+                sent["spaces"].append(space_after == "_")
+            sent["entities"] = ["-"] * len(sent["words"])
+            sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
             if oracle_segments:
-                docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
+                docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
                 golds.append(GoldParse(docs[-1], **sent))
 
             sent_annots.append(sent)
@@ -128,18 +137,18 @@ def read_conllu(file_):
     sent = []
     doc = []
     for line in file_:
-        if line.startswith('# newdoc'):
+        if line.startswith("# newdoc"):
             if doc:
                 docs.append(doc)
             doc = []
-        elif line.startswith('#'):
+        elif line.startswith("#"):
             continue
         elif not line.strip():
             if sent:
                 doc.append(sent)
                 sent = []
         else:
-            sent.append(list(line.strip().split('\t')))
+            sent.append(list(line.strip().split("\t")))
             if len(sent[-1]) != 10:
                 print(repr(line))
                 raise ValueError
@@ -154,25 +163,29 @@ def _make_gold(nlp, text, sent_annots):
     # Flatten the conll annotations, and adjust the head indices
     flat = defaultdict(list)
     for sent in sent_annots:
-        flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
-        for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
+        flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
+        for field in ["words", "tags", "deps", "entities", "spaces"]:
             flat[field].extend(sent[field])
     # Construct text if necessary
-    assert len(flat['words']) == len(flat['spaces'])
+    assert len(flat["words"]) == len(flat["spaces"])
     if text is None:
-        text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces']))
+        text = "".join(
+            word + " " * space for word, space in zip(flat["words"], flat["spaces"])
+        )
     doc = nlp.make_doc(text)
-    flat.pop('spaces')
+    flat.pop("spaces")
     gold = GoldParse(doc, **flat)
     return doc, gold
 
+
 #############################
 # Data transforms for spaCy #
 #############################
 
+
 def golds_to_gold_tuples(docs, golds):
-    '''Get out the annoying 'tuples' format used by begin_training, given the
-    GoldParse objects.'''
+    """Get out the annoying 'tuples' format used by begin_training, given the
+    GoldParse objects."""
     tuples = []
     for doc, gold in zip(docs, golds):
         text = doc.text
@@ -186,15 +199,16 @@ def golds_to_gold_tuples(docs, golds):
 # Evaluation #
 ##############
 
+
 def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
-    with text_loc.open('r', encoding='utf8') as text_file:
+    with text_loc.open("r", encoding="utf8") as text_file:
         texts = split_text(text_file.read())
         docs = list(nlp.pipe(texts))
-    with sys_loc.open('w', encoding='utf8') as out_file:
+    with sys_loc.open("w", encoding="utf8") as out_file:
         write_conllu(docs, out_file)
-    with gold_loc.open('r', encoding='utf8') as gold_file:
+    with gold_loc.open("r", encoding="utf8") as gold_file:
         gold_ud = conll17_ud_eval.load_conllu(gold_file)
-    with sys_loc.open('r', encoding='utf8') as sys_file:
+    with sys_loc.open("r", encoding="utf8") as sys_file:
         sys_ud = conll17_ud_eval.load_conllu(sys_file)
     scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
     return scores
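For readers following the data-reading hunks above: read_data() projectivizes the gold heads before training. A tiny illustration with made-up heads and labels, assuming spaCy v2.x where the helper this script uses lives in `spacy.syntax.nonproj`:

```python
from spacy.syntax.nonproj import projectivize

# Token 2 attaches across token 3's arc, so the tree is non-projective.
heads = [1, 1, 4, 1, 1]
labels = ["dep", "ROOT", "dep", "dep", "dep"]
proj_heads, deco_labels = projectivize(heads, labels)
# Lifted arcs get labels decorated with "||", which the training code later strips.
print(proj_heads, deco_labels)
```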
@ -202,10 +216,10 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
|
|
||||||
def write_conllu(docs, file_):
|
def write_conllu(docs, file_):
|
||||||
merger = Matcher(docs[0].vocab)
|
merger = Matcher(docs[0].vocab)
|
||||||
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
|
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
matches = merger(doc)
|
matches = merger(doc)
|
||||||
spans = [doc[start:end+1] for _, start, end in matches]
|
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
offsets = [(span.start_char, span.end_char) for span in spans]
|
||||||
for start_char, end_char in offsets:
|
for start_char, end_char in offsets:
|
||||||
doc.merge(start_char, end_char)
|
doc.merge(start_char, end_char)
|
||||||
|
@ -214,58 +228,73 @@ def write_conllu(docs, file_):
|
||||||
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
||||||
file_.write("# text = {text}\n".format(text=sent.text))
|
file_.write("# text = {text}\n".format(text=sent.text))
|
||||||
for k, token in enumerate(sent):
|
for k, token in enumerate(sent):
|
||||||
file_.write(token._.get_conllu_lines(k) + '\n')
|
file_.write(token._.get_conllu_lines(k) + "\n")
|
||||||
file_.write('\n')
|
file_.write("\n")
|
||||||
|
|
||||||
|
|
||||||
def print_progress(itn, losses, ud_scores):
|
def print_progress(itn, losses, ud_scores):
|
||||||
fields = {
|
fields = {
|
||||||
'dep_loss': losses.get('parser', 0.0),
|
"dep_loss": losses.get("parser", 0.0),
|
||||||
'tag_loss': losses.get('tagger', 0.0),
|
"tag_loss": losses.get("tagger", 0.0),
|
||||||
'words': ud_scores['Words'].f1 * 100,
|
"words": ud_scores["Words"].f1 * 100,
|
||||||
'sents': ud_scores['Sentences'].f1 * 100,
|
"sents": ud_scores["Sentences"].f1 * 100,
|
||||||
'tags': ud_scores['XPOS'].f1 * 100,
|
"tags": ud_scores["XPOS"].f1 * 100,
|
||||||
'uas': ud_scores['UAS'].f1 * 100,
|
"uas": ud_scores["UAS"].f1 * 100,
|
||||||
'las': ud_scores['LAS'].f1 * 100,
|
"las": ud_scores["LAS"].f1 * 100,
|
||||||
}
|
}
|
||||||
header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
|
header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"]
|
||||||
if itn == 0:
|
if itn == 0:
|
||||||
print('\t'.join(header))
|
print("\t".join(header))
|
||||||
tpl = '\t'.join((
|
tpl = "\t".join(
|
||||||
'{:d}',
|
(
|
||||||
'{dep_loss:.1f}',
|
"{:d}",
|
||||||
'{las:.1f}',
|
"{dep_loss:.1f}",
|
||||||
'{uas:.1f}',
|
"{las:.1f}",
|
||||||
'{tags:.1f}',
|
"{uas:.1f}",
|
||||||
'{sents:.1f}',
|
"{tags:.1f}",
|
||||||
'{words:.1f}',
|
"{sents:.1f}",
|
||||||
))
|
"{words:.1f}",
|
||||||
|
)
|
||||||
|
)
|
||||||
print(tpl.format(itn, **fields))
|
print(tpl.format(itn, **fields))
|
||||||
|
|
||||||
#def get_sent_conllu(sent, sent_id):
|
|
||||||
|
# def get_sent_conllu(sent, sent_id):
|
||||||
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
||||||
|
|
||||||
|
|
||||||
def get_token_conllu(token, i):
|
def get_token_conllu(token, i):
|
||||||
if token._.begins_fused:
|
if token._.begins_fused:
|
||||||
n = 1
|
n = 1
|
||||||
while token.nbor(n)._.inside_fused:
|
while token.nbor(n)._.inside_fused:
|
||||||
n += 1
|
n += 1
|
||||||
id_ = '%d-%d' % (i, i+n)
|
id_ = "%d-%d" % (i, i + n)
|
||||||
lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']
|
lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"]
|
||||||
else:
|
else:
|
||||||
lines = []
|
lines = []
|
||||||
if token.head.i == token.i:
|
if token.head.i == token.i:
|
||||||
head = 0
|
head = 0
|
||||||
else:
|
else:
|
||||||
head = i + (token.head.i - token.i) + 1
|
head = i + (token.head.i - token.i) + 1
|
||||||
fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
|
fields = [
|
||||||
str(head), token.dep_.lower(), '_', '_']
|
str(i + 1),
|
||||||
lines.append('\t'.join(fields))
|
token.text,
|
||||||
return '\n'.join(lines)
|
token.lemma_,
|
||||||
|
token.pos_,
|
||||||
|
token.tag_,
|
||||||
|
"_",
|
||||||
|
str(head),
|
||||||
|
token.dep_.lower(),
|
||||||
|
"_",
|
||||||
|
"_",
|
||||||
|
]
|
||||||
|
lines.append("\t".join(fields))
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
Token.set_extension('get_conllu_lines', method=get_token_conllu)
|
|
||||||
Token.set_extension('begins_fused', default=False)
|
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
||||||
Token.set_extension('inside_fused', default=False)
|
Token.set_extension("begins_fused", default=False)
|
||||||
|
Token.set_extension("inside_fused", default=False)
|
||||||
|
|
||||||
|
|
||||||
##################
|
##################
|
||||||
|
@ -274,31 +303,32 @@ Token.set_extension('inside_fused', default=False)
|
||||||
|
|
||||||
|
|
||||||
def load_nlp(corpus, config):
|
def load_nlp(corpus, config):
|
||||||
lang = corpus.split('_')[0]
|
lang = corpus.split("_")[0]
|
||||||
nlp = spacy.blank(lang)
|
nlp = spacy.blank(lang)
|
||||||
if config.vectors:
|
if config.vectors:
|
||||||
nlp.vocab.from_disk(config.vectors / 'vocab')
|
nlp.vocab.from_disk(config.vectors / "vocab")
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def initialize_pipeline(nlp, docs, golds, config):
|
def initialize_pipeline(nlp, docs, golds, config):
|
||||||
nlp.add_pipe(nlp.create_pipe('parser'))
|
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||||
if config.multitask_tag:
|
if config.multitask_tag:
|
||||||
nlp.parser.add_multitask_objective('tag')
|
nlp.parser.add_multitask_objective("tag")
|
||||||
if config.multitask_sent:
|
if config.multitask_sent:
|
||||||
nlp.parser.add_multitask_objective('sent_start')
|
nlp.parser.add_multitask_objective("sent_start")
|
||||||
nlp.parser.moves.add_action(2, 'subtok')
|
nlp.parser.moves.add_action(2, "subtok")
|
||||||
nlp.add_pipe(nlp.create_pipe('tagger'))
|
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||||
for gold in golds:
|
for gold in golds:
|
||||||
for tag in gold.tags:
|
for tag in gold.tags:
|
||||||
if tag is not None:
|
if tag is not None:
|
||||||
nlp.tagger.add_label(tag)
|
nlp.tagger.add_label(tag)
|
||||||
# Replace labels that didn't make the frequency cutoff
|
# Replace labels that didn't make the frequency cutoff
|
||||||
actions = set(nlp.parser.labels)
|
actions = set(nlp.parser.labels)
|
||||||
label_set = set([act.split('-')[1] for act in actions if '-' in act])
|
label_set = set([act.split("-")[1] for act in actions if "-" in act])
|
||||||
for gold in golds:
|
for gold in golds:
|
||||||
for i, label in enumerate(gold.labels):
|
for i, label in enumerate(gold.labels):
|
||||||
if label is not None and label not in label_set:
|
if label is not None and label not in label_set:
|
||||||
gold.labels[i] = label.split('||')[0]
|
gold.labels[i] = label.split("||")[0]
|
||||||
return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
|
return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
|
||||||
|
|
||||||
|
|
||||||
|
@ -306,6 +336,7 @@ def initialize_pipeline(nlp, docs, golds, config):
|
||||||
# Command line helpers #
|
# Command line helpers #
|
||||||
########################
|
########################
|
||||||
|
|
||||||
|
|
||||||
@attr.s
|
@attr.s
|
||||||
class Config(object):
|
class Config(object):
|
||||||
vectors = attr.ib(default=None)
|
vectors = attr.ib(default=None)
|
||||||
|
@ -318,7 +349,7 @@ class Config(object):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, loc):
|
def load(cls, loc):
|
||||||
with Path(loc).open('r', encoding='utf8') as file_:
|
with Path(loc).open("r", encoding="utf8") as file_:
|
||||||
cfg = json.load(file_)
|
cfg = json.load(file_)
|
||||||
return cls(**cfg)
|
return cls(**cfg)
|
||||||
|
|
||||||
|
@ -331,32 +362,36 @@ class Dataset(object):
|
||||||
self.text = None
|
self.text = None
|
||||||
for file_path in self.path.iterdir():
|
for file_path in self.path.iterdir():
|
||||||
name = file_path.parts[-1]
|
name = file_path.parts[-1]
|
||||||
if section in name and name.endswith('conllu'):
|
if section in name and name.endswith("conllu"):
|
||||||
self.conllu = file_path
|
self.conllu = file_path
|
||||||
elif section in name and name.endswith('txt'):
|
elif section in name and name.endswith("txt"):
|
||||||
self.text = file_path
|
self.text = file_path
|
||||||
if self.conllu is None:
|
if self.conllu is None:
|
||||||
msg = "Could not find .txt file in {path} for {section}"
|
msg = "Could not find .txt file in {path} for {section}"
|
||||||
raise IOError(msg.format(section=section, path=path))
|
raise IOError(msg.format(section=section, path=path))
|
||||||
if self.text is None:
|
if self.text is None:
|
||||||
msg = "Could not find .txt file in {path} for {section}"
|
msg = "Could not find .txt file in {path} for {section}"
|
||||||
self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
|
self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0]
|
||||||
|
|
||||||
|
|
||||||
class TreebankPaths(object):
|
class TreebankPaths(object):
|
||||||
def __init__(self, ud_path, treebank, **cfg):
|
def __init__(self, ud_path, treebank, **cfg):
|
||||||
self.train = Dataset(ud_path / treebank, 'train')
|
self.train = Dataset(ud_path / treebank, "train")
|
||||||
self.dev = Dataset(ud_path / treebank, 'dev')
|
self.dev = Dataset(ud_path / treebank, "dev")
|
||||||
self.lang = self.train.lang
|
self.lang = self.train.lang
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
|
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
|
||||||
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
corpus=(
|
||||||
"positional", None, str),
|
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
||||||
|
"positional",
|
||||||
|
None,
|
||||||
|
str,
|
||||||
|
),
|
||||||
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
||||||
config=("Path to json formatted config file", "positional", None, Config.load),
|
config=("Path to json formatted config file", "positional", None, Config.load),
|
||||||
limit=("Size limit", "option", "n", int)
|
limit=("Size limit", "option", "n", int),
|
||||||
)
|
)
|
||||||
def main(ud_dir, parses_dir, config, corpus, limit=0):
|
def main(ud_dir, parses_dir, config, corpus, limit=0):
|
||||||
paths = TreebankPaths(ud_dir, corpus)
|
paths = TreebankPaths(ud_dir, corpus)
|
||||||
|
@ -365,8 +400,13 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
|
||||||
print("Train and evaluate", corpus, "using lang", paths.lang)
|
print("Train and evaluate", corpus, "using lang", paths.lang)
|
||||||
nlp = load_nlp(paths.lang, config)
|
nlp = load_nlp(paths.lang, config)
|
||||||
|
|
||||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
docs, golds = read_data(
|
||||||
max_doc_length=config.max_doc_length, limit=limit)
|
nlp,
|
||||||
|
paths.train.conllu.open(),
|
||||||
|
paths.train.text.open(),
|
||||||
|
max_doc_length=config.max_doc_length,
|
||||||
|
limit=limit,
|
||||||
|
)
|
||||||
|
|
||||||
optimizer = initialize_pipeline(nlp, docs, golds, config)
|
optimizer = initialize_pipeline(nlp, docs, golds, config)
|
||||||
|
|
||||||
|
@ -379,14 +419,19 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
batch_docs, batch_gold = zip(*batch)
|
batch_docs, batch_gold = zip(*batch)
|
||||||
pbar.update(sum(len(doc) for doc in batch_docs))
|
pbar.update(sum(len(doc) for doc in batch_docs))
|
||||||
nlp.update(batch_docs, batch_gold, sgd=optimizer,
|
nlp.update(
|
||||||
drop=config.dropout, losses=losses)
|
batch_docs,
|
||||||
|
batch_gold,
|
||||||
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
|
sgd=optimizer,
|
||||||
|
drop=config.dropout,
|
||||||
|
losses=losses,
|
||||||
|
)
|
||||||
|
|
||||||
|
out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
|
scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
|
||||||
print_progress(i, losses, scores)
|
print_progress(i, losses, scores)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
plac.call(main)
|
plac.call(main)
|
||||||
|
|
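To make the (Doc, GoldParse) pairing that this training script builds a little more concrete, here is a minimal hand-written example. The sentence and its tags, heads and labels are invented for illustration; the spaCy v2.x API is assumed.

```python
import spacy
from spacy.gold import GoldParse

nlp = spacy.blank("en")
words = ["I", "like", "London", "."]
annot = {
    "words": words,
    "tags": ["PRP", "VBP", "NNP", "."],
    "heads": [1, 1, 1, 1],
    "deps": ["nsubj", "ROOT", "dobj", "punct"],
}
doc = nlp.make_doc(" ".join(words))
gold = GoldParse(doc, **annot)
print(gold.words, gold.heads, gold.labels)
```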
|
@@ -1,4 +1,4 @@
-'''This example shows how to add a multi-task objective that is trained
+"""This example shows how to add a multi-task objective that is trained
 alongside the entity recognizer. This is an alternative to adding features
 to the model.
 
@@ -19,7 +19,7 @@ The specific example here is not necessarily a good idea --- but it shows
 how an arbitrary objective function for some word can be used.
 
 Developed and tested for spaCy 2.0.6
-'''
+"""
 import random
 import plac
 import spacy
@@ -30,30 +30,29 @@ random.seed(0)
 
 PWD = os.path.dirname(__file__)
 
-TRAIN_DATA = list(read_json_file(os.path.join(PWD, 'training-data.json')))
-
+TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json")))
 
 
 def get_position_label(i, words, tags, heads, labels, ents):
-    '''Return labels indicating the position of the word in the document.
-    '''
+    """Return labels indicating the position of the word in the document.
+    """
     if len(words) < 20:
-        return 'short-doc'
+        return "short-doc"
     elif i == 0:
-        return 'first-word'
+        return "first-word"
     elif i < 10:
-        return 'early-word'
+        return "early-word"
     elif i < 20:
-        return 'mid-word'
-    elif i == len(words)-1:
-        return 'last-word'
+        return "mid-word"
+    elif i == len(words) - 1:
+        return "last-word"
     else:
-        return 'late-word'
+        return "late-word"
 
 
 def main(n_iter=10):
-    nlp = spacy.blank('en')
-    ner = nlp.create_pipe('ner')
+    nlp = spacy.blank("en")
+    ner = nlp.create_pipe("ner")
     ner.add_multitask_objective(get_position_label)
     nlp.add_pipe(ner)
 
@@ -71,15 +70,16 @@ def main(n_iter=10):
                 [gold],  # batch of annotations
                 drop=0.2,  # dropout - make it harder to memorise data
                 sgd=optimizer,  # callable to update weights
-                losses=losses)
-        print(losses.get('nn_labeller', 0.0), losses['ner'])
+                losses=losses,
+            )
+        print(losses.get("nn_labeller", 0.0), losses["ner"])
 
     # test the trained model
     for text, _ in TRAIN_DATA:
         doc = nlp(text)
-        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
-        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
+        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
+        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     plac.call(main)
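A condensed sketch of the hook this example demonstrates: any callable with the signature `(i, words, tags, heads, labels, ents)` that returns a string label can be attached as an auxiliary objective before training. The word-length objective below is invented for illustration, and the exact training setup is elided; spaCy v2.x is assumed.

```python
import spacy


def word_length_label(i, words, tags, heads, labels, ents):
    # Auxiliary label for token i: bucket tokens by their length.
    return "long" if len(words[i]) > 5 else "short"


nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
ner.add_multitask_objective(word_length_label)
nlp.add_pipe(ner)
ner.add_label("PERSON")
optimizer = nlp.begin_training()  # the auxiliary objective is built here
```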
|
|
|
@ -1,4 +1,4 @@
|
||||||
'''This script is experimental.
|
"""This script is experimental.
|
||||||
|
|
||||||
Try pre-training the CNN component of the text categorizer using a cheap
|
Try pre-training the CNN component of the text categorizer using a cheap
|
||||||
language modelling-like objective. Specifically, we load pre-trained vectors
|
language modelling-like objective. Specifically, we load pre-trained vectors
|
||||||
|
@ -12,7 +12,7 @@ To evaluate the technique, we're pre-training with the 50k texts from the IMDB
|
||||||
corpus, and then training with only 100 labels. Note that it's a bit dirty to
|
corpus, and then training with only 100 labels. Note that it's a bit dirty to
|
||||||
pre-train with the development data, but also not *so* terrible: we're not using
|
pre-train with the development data, but also not *so* terrible: we're not using
|
||||||
the development labels, after all --- only the unlabelled text.
|
the development labels, after all --- only the unlabelled text.
|
||||||
'''
|
"""
|
||||||
import plac
|
import plac
|
||||||
import random
|
import random
|
||||||
import spacy
|
import spacy
|
||||||
|
@ -46,8 +46,8 @@ def load_textcat_data(limit=0):
|
||||||
train_data = train_data[-limit:]
|
train_data = train_data[-limit:]
|
||||||
texts, labels = zip(*train_data)
|
texts, labels = zip(*train_data)
|
||||||
eval_texts, eval_labels = zip(*eval_data)
|
eval_texts, eval_labels = zip(*eval_data)
|
||||||
cats = [{'POSITIVE': bool(y), 'NEGATIVE': not bool(y)} for y in labels]
|
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
|
||||||
eval_cats = [{'POSITIVE': bool(y), 'NEGATIVE': not bool(y)} for y in eval_labels]
|
eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
|
||||||
return (texts, cats), (eval_texts, eval_cats)
|
return (texts, cats), (eval_texts, eval_cats)
|
||||||
|
|
||||||
|
|
||||||
|
@ -57,6 +57,7 @@ def prefer_gpu():
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
import cupy.random
|
import cupy.random
|
||||||
|
|
||||||
cupy.random.seed(0)
|
cupy.random.seed(0)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
@ -68,7 +69,7 @@ def build_textcat_model(tok2vec, nr_class, width):
|
||||||
from thinc.misc import Residual, LayerNorm
|
from thinc.misc import Residual, LayerNorm
|
||||||
from spacy._ml import logistic, zero_init
|
from spacy._ml import logistic, zero_init
|
||||||
|
|
||||||
with Model.define_operators({'>>': chain}):
|
with Model.define_operators({">>": chain}):
|
||||||
model = (
|
model = (
|
||||||
tok2vec
|
tok2vec
|
||||||
>> flatten_add_lengths
|
>> flatten_add_lengths
|
||||||
|
@ -78,27 +79,35 @@ def build_textcat_model(tok2vec, nr_class, width):
|
||||||
model.tok2vec = tok2vec
|
model.tok2vec = tok2vec
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def block_gradients(model):
|
def block_gradients(model):
|
||||||
from thinc.api import wrap
|
from thinc.api import wrap
|
||||||
def forward(X, drop=0.):
|
|
||||||
|
def forward(X, drop=0.0):
|
||||||
Y, _ = model.begin_update(X, drop=drop)
|
Y, _ = model.begin_update(X, drop=drop)
|
||||||
return Y, None
|
return Y, None
|
||||||
|
|
||||||
return wrap(forward, model)
|
return wrap(forward, model)
|
||||||
|
|
||||||
|
|
||||||
def create_pipeline(width, embed_size, vectors_model):
|
def create_pipeline(width, embed_size, vectors_model):
|
||||||
print("Load vectors")
|
print("Load vectors")
|
||||||
nlp = spacy.load(vectors_model)
|
nlp = spacy.load(vectors_model)
|
||||||
print("Start training")
|
print("Start training")
|
||||||
textcat = TextCategorizer(nlp.vocab,
|
textcat = TextCategorizer(
|
||||||
labels=['POSITIVE', 'NEGATIVE'],
|
nlp.vocab,
|
||||||
|
labels=["POSITIVE", "NEGATIVE"],
|
||||||
model=build_textcat_model(
|
model=build_textcat_model(
|
||||||
Tok2Vec(width=width, embed_size=embed_size), 2, width))
|
Tok2Vec(width=width, embed_size=embed_size), 2, width
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
nlp.add_pipe(textcat)
|
nlp.add_pipe(textcat)
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def train_tensorizer(nlp, texts, dropout, n_iter):
|
def train_tensorizer(nlp, texts, dropout, n_iter):
|
||||||
tensorizer = nlp.create_pipe('tensorizer')
|
tensorizer = nlp.create_pipe("tensorizer")
|
||||||
nlp.add_pipe(tensorizer)
|
nlp.add_pipe(tensorizer)
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
for i in range(n_iter):
|
for i in range(n_iter):
|
||||||
|
@ -109,36 +118,43 @@ def train_tensorizer(nlp, texts, dropout, n_iter):
|
||||||
print(losses)
|
print(losses)
|
||||||
return optimizer
|
return optimizer
|
||||||
|
|
||||||
|
|
||||||
def train_textcat(nlp, n_texts, n_iter=10):
|
def train_textcat(nlp, n_texts, n_iter=10):
|
||||||
textcat = nlp.get_pipe('textcat')
|
textcat = nlp.get_pipe("textcat")
|
||||||
tok2vec_weights = textcat.model.tok2vec.to_bytes()
|
tok2vec_weights = textcat.model.tok2vec.to_bytes()
|
||||||
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
|
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
|
||||||
print("Using {} examples ({} training, {} evaluation)"
|
print(
|
||||||
.format(n_texts, len(train_texts), len(dev_texts)))
|
"Using {} examples ({} training, {} evaluation)".format(
|
||||||
train_data = list(zip(train_texts,
|
n_texts, len(train_texts), len(dev_texts)
|
||||||
[{'cats': cats} for cats in train_cats]))
|
)
|
||||||
|
)
|
||||||
|
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
|
||||||
|
|
||||||
# get names of other pipes to disable them during training
|
# get names of other pipes to disable them during training
|
||||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
|
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
|
||||||
with nlp.disable_pipes(*other_pipes): # only train textcat
|
with nlp.disable_pipes(*other_pipes): # only train textcat
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
textcat.model.tok2vec.from_bytes(tok2vec_weights)
|
textcat.model.tok2vec.from_bytes(tok2vec_weights)
|
||||||
print("Training the model...")
|
print("Training the model...")
|
||||||
print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
|
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
|
||||||
for i in range(n_iter):
|
for i in range(n_iter):
|
||||||
losses = {'textcat': 0.0}
|
losses = {"textcat": 0.0}
|
||||||
# batch up the examples using spaCy's minibatch
|
# batch up the examples using spaCy's minibatch
|
||||||
batches = minibatch(tqdm.tqdm(train_data), size=2)
|
batches = minibatch(tqdm.tqdm(train_data), size=2)
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
texts, annotations = zip(*batch)
|
texts, annotations = zip(*batch)
|
||||||
nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
|
nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
|
||||||
losses=losses)
|
|
||||||
with textcat.model.use_params(optimizer.averages):
|
with textcat.model.use_params(optimizer.averages):
|
||||||
# evaluate on the dev data split off in load_data()
|
# evaluate on the dev data split off in load_data()
|
||||||
scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
|
scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
|
||||||
print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table
|
print(
|
||||||
.format(losses['textcat'], scores['textcat_p'],
|
"{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table
|
||||||
scores['textcat_r'], scores['textcat_f']))
|
losses["textcat"],
|
||||||
|
scores["textcat_p"],
|
||||||
|
scores["textcat_r"],
|
||||||
|
scores["textcat_f"],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def evaluate_textcat(tokenizer, textcat, texts, cats):
|
def evaluate_textcat(tokenizer, textcat, texts, cats):
|
||||||
|
@ -153,9 +169,9 @@ def evaluate_textcat(tokenizer, textcat, texts, cats):
|
||||||
if label not in gold:
|
if label not in gold:
|
||||||
continue
|
continue
|
||||||
if score >= 0.5 and gold[label] >= 0.5:
|
if score >= 0.5 and gold[label] >= 0.5:
|
||||||
tp += 1.
|
tp += 1.0
|
||||||
elif score >= 0.5 and gold[label] < 0.5:
|
elif score >= 0.5 and gold[label] < 0.5:
|
||||||
fp += 1.
|
fp += 1.0
|
||||||
elif score < 0.5 and gold[label] < 0.5:
|
elif score < 0.5 and gold[label] < 0.5:
|
||||||
tn += 1
|
tn += 1
|
||||||
elif score < 0.5 and gold[label] >= 0.5:
|
elif score < 0.5 and gold[label] >= 0.5:
|
||||||
|
@ -163,8 +179,7 @@ def evaluate_textcat(tokenizer, textcat, texts, cats):
|
||||||
precision = tp / (tp + fp)
|
precision = tp / (tp + fp)
|
||||||
recall = tp / (tp + fn)
|
recall = tp / (tp + fn)
|
||||||
f_score = 2 * (precision * recall) / (precision + recall)
|
f_score = 2 * (precision * recall) / (precision + recall)
|
||||||
return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}
|
return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
|
@ -173,10 +188,16 @@ def evaluate_textcat(tokenizer, textcat, texts, cats):
|
||||||
pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
|
pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
|
||||||
train_iters=("Number of iterations to pretrain", "option", "tn", int),
|
train_iters=("Number of iterations to pretrain", "option", "tn", int),
|
||||||
train_examples=("Number of labelled examples", "option", "eg", int),
|
train_examples=("Number of labelled examples", "option", "eg", int),
|
||||||
vectors_model=("Name or path to vectors model to learn from")
|
vectors_model=("Name or path to vectors model to learn from"),
|
||||||
)
|
)
|
||||||
def main(width, embed_size, vectors_model,
|
def main(
|
||||||
pretrain_iters=30, train_iters=30, train_examples=1000):
|
width,
|
||||||
|
embed_size,
|
||||||
|
vectors_model,
|
||||||
|
pretrain_iters=30,
|
||||||
|
train_iters=30,
|
||||||
|
train_examples=1000,
|
||||||
|
):
|
||||||
random.seed(0)
|
random.seed(0)
|
||||||
numpy.random.seed(0)
|
numpy.random.seed(0)
|
||||||
use_gpu = prefer_gpu()
|
use_gpu = prefer_gpu()
|
||||||
|
@ -190,5 +211,6 @@ def main(width, embed_size, vectors_model,
|
||||||
print("Train textcat")
|
print("Train textcat")
|
||||||
train_textcat(nlp, train_examples, n_iter=train_iters)
|
train_textcat(nlp, train_examples, n_iter=train_iters)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
|
if __name__ == "__main__":
|
||||||
plac.call(main)
|
plac.call(main)
|
||||||
|
|
|
@ -29,73 +29,113 @@ from spacy.util import minibatch, compounding
|
||||||
# training data: texts, heads and dependency labels
|
# training data: texts, heads and dependency labels
|
||||||
# for no relation, we simply chose an arbitrary dependency label, e.g. '-'
|
# for no relation, we simply chose an arbitrary dependency label, e.g. '-'
|
||||||
TRAIN_DATA = [
|
TRAIN_DATA = [
|
||||||
("find a cafe with great wifi", {
|
(
|
||||||
'heads': [0, 2, 0, 5, 5, 2], # index of token head
|
"find a cafe with great wifi",
|
||||||
'deps': ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
|
{
|
||||||
}),
|
"heads": [0, 2, 0, 5, 5, 2], # index of token head
|
||||||
("find a hotel near the beach", {
|
"deps": ["ROOT", "-", "PLACE", "-", "QUALITY", "ATTRIBUTE"],
|
||||||
'heads': [0, 2, 0, 5, 5, 2],
|
},
|
||||||
'deps': ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE']
|
),
|
||||||
}),
|
(
|
||||||
("find me the closest gym that's open late", {
|
"find a hotel near the beach",
|
||||||
'heads': [0, 0, 4, 4, 0, 6, 4, 6, 6],
|
{
|
||||||
'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME']
|
"heads": [0, 2, 0, 5, 5, 2],
|
||||||
}),
|
"deps": ["ROOT", "-", "PLACE", "QUALITY", "-", "ATTRIBUTE"],
|
||||||
("show me the cheapest store that sells flowers", {
|
},
|
||||||
'heads': [0, 0, 4, 4, 0, 4, 4, 4], # attach "flowers" to store!
|
),
|
||||||
'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT']
|
(
|
||||||
}),
|
"find me the closest gym that's open late",
|
||||||
("find a nice restaurant in london", {
|
{
|
||||||
'heads': [0, 3, 3, 0, 3, 3],
|
"heads": [0, 0, 4, 4, 0, 6, 4, 6, 6],
|
||||||
'deps': ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
|
"deps": [
|
||||||
}),
|
"ROOT",
|
||||||
("show me the coolest hostel in berlin", {
|
"-",
|
||||||
'heads': [0, 0, 4, 4, 0, 4, 4],
|
"-",
|
||||||
'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
|
"QUALITY",
|
||||||
}),
|
"PLACE",
|
||||||
("find a good italian restaurant near work", {
|
"-",
|
||||||
'heads': [0, 4, 4, 4, 0, 4, 5],
|
"-",
|
||||||
'deps': ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION']
|
"ATTRIBUTE",
|
||||||
})
|
"TIME",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"show me the cheapest store that sells flowers",
|
||||||
|
{
|
||||||
|
"heads": [0, 0, 4, 4, 0, 4, 4, 4], # attach "flowers" to store!
|
||||||
|
"deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "-", "PRODUCT"],
|
||||||
|
},
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"find a nice restaurant in london",
|
||||||
|
{
|
||||||
|
"heads": [0, 3, 3, 0, 3, 3],
|
||||||
|
"deps": ["ROOT", "-", "QUALITY", "PLACE", "-", "LOCATION"],
|
||||||
|
},
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"show me the coolest hostel in berlin",
|
||||||
|
{
|
||||||
|
"heads": [0, 0, 4, 4, 0, 4, 4],
|
||||||
|
"deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "LOCATION"],
|
||||||
|
},
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"find a good italian restaurant near work",
|
||||||
|
{
|
||||||
|
"heads": [0, 4, 4, 4, 0, 4, 5],
|
||||||
|
"deps": [
|
||||||
|
"ROOT",
|
||||||
|
"-",
|
||||||
|
"QUALITY",
|
||||||
|
"ATTRIBUTE",
|
||||||
|
"PLACE",
|
||||||
|
"ATTRIBUTE",
|
||||||
|
"LOCATION",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
|
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
|
||||||
output_dir=("Optional output directory", "option", "o", Path),
|
output_dir=("Optional output directory", "option", "o", Path),
|
||||||
n_iter=("Number of training iterations", "option", "n", int))
|
n_iter=("Number of training iterations", "option", "n", int),
|
||||||
|
)
|
||||||
def main(model=None, output_dir=None, n_iter=15):
|
def main(model=None, output_dir=None, n_iter=15):
|
||||||
"""Load the model, set up the pipeline and train the parser."""
|
"""Load the model, set up the pipeline and train the parser."""
|
||||||
if model is not None:
|
if model is not None:
|
||||||
nlp = spacy.load(model) # load existing spaCy model
|
nlp = spacy.load(model) # load existing spaCy model
|
||||||
print("Loaded model '%s'" % model)
|
print("Loaded model '%s'" % model)
|
||||||
else:
|
else:
|
||||||
nlp = spacy.blank('en') # create blank Language class
|
nlp = spacy.blank("en") # create blank Language class
|
||||||
print("Created blank 'en' model")
|
print("Created blank 'en' model")
|
||||||
|
|
||||||
# We'll use the built-in dependency parser class, but we want to create a
|
# We'll use the built-in dependency parser class, but we want to create a
|
||||||
# fresh instance – just in case.
|
# fresh instance – just in case.
|
||||||
if 'parser' in nlp.pipe_names:
|
if "parser" in nlp.pipe_names:
|
||||||
nlp.remove_pipe('parser')
|
nlp.remove_pipe("parser")
|
||||||
parser = nlp.create_pipe('parser')
|
parser = nlp.create_pipe("parser")
|
||||||
nlp.add_pipe(parser, first=True)
|
nlp.add_pipe(parser, first=True)
|
||||||
|
|
||||||
for text, annotations in TRAIN_DATA:
|
for text, annotations in TRAIN_DATA:
|
||||||
for dep in annotations.get('deps', []):
|
for dep in annotations.get("deps", []):
|
||||||
parser.add_label(dep)
|
parser.add_label(dep)
|
||||||
|
|
||||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
|
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
|
||||||
with nlp.disable_pipes(*other_pipes): # only train parser
|
with nlp.disable_pipes(*other_pipes): # only train parser
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
# batch up the examples using spaCy's minibatch
|
# batch up the examples using spaCy's minibatch
|
||||||
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
texts, annotations = zip(*batch)
|
texts, annotations = zip(*batch)
|
||||||
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||||
print('Losses', losses)
|
print("Losses", losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
test_model(nlp)
|
test_model(nlp)
|
||||||
|
@ -115,16 +155,18 @@ def main(model=None, output_dir=None, n_iter=15):
|
||||||
|
|
||||||
|
|
||||||
def test_model(nlp):
|
def test_model(nlp):
|
||||||
texts = ["find a hotel with good wifi",
|
texts = [
|
||||||
"find me the cheapest gym near work",
|
"find a hotel with good wifi",
|
||||||
"show me the best hotel in berlin"]
|
"find me the cheapest gym near work",
|
||||||
|
"show me the best hotel in berlin",
|
||||||
|
]
|
||||||
docs = nlp.pipe(texts)
|
docs = nlp.pipe(texts)
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
print(doc.text)
|
print(doc.text)
|
||||||
print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-'])
|
print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
plac.call(main)
|
plac.call(main)
|
||||||
|
|
||||||
# Expected output:
|
# Expected output:
|
||||||
|
|
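Before training on annotations in the TRAIN_DATA format shown above, it can help to sanity-check that each entry is internally consistent. A small, self-contained helper (hypothetical, not part of the example) that checks one entry:

```python
def check_annotation(text, heads, deps):
    # One head index and one label per whitespace token, with ROOT pointing at itself.
    words = text.split()
    assert len(words) == len(heads) == len(deps)
    for i, (head, dep) in enumerate(zip(heads, deps)):
        assert 0 <= head < len(words)
        if dep == "ROOT":
            assert head == i
    return True


print(check_annotation(
    "find a cafe with great wifi",
    [0, 2, 0, 5, 5, 2],
    ["ROOT", "-", "PLACE", "-", "QUALITY", "ATTRIBUTE"],
))
```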
|
@ -20,51 +20,48 @@ from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
# training data
|
# training data
|
||||||
TRAIN_DATA = [
|
TRAIN_DATA = [
|
||||||
('Who is Shaka Khan?', {
|
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
|
||||||
'entities': [(7, 17, 'PERSON')]
|
("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
|
||||||
}),
|
|
||||||
('I like London and Berlin.', {
|
|
||||||
'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
|
|
||||||
})
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
|
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
|
||||||
output_dir=("Optional output directory", "option", "o", Path),
|
output_dir=("Optional output directory", "option", "o", Path),
|
||||||
n_iter=("Number of training iterations", "option", "n", int))
|
n_iter=("Number of training iterations", "option", "n", int),
|
||||||
|
)
|
||||||
def main(model=None, output_dir=None, n_iter=100):
|
def main(model=None, output_dir=None, n_iter=100):
|
||||||
"""Load the model, set up the pipeline and train the entity recognizer."""
|
"""Load the model, set up the pipeline and train the entity recognizer."""
|
||||||
if model is not None:
|
if model is not None:
|
||||||
nlp = spacy.load(model) # load existing spaCy model
|
nlp = spacy.load(model) # load existing spaCy model
|
||||||
print("Loaded model '%s'" % model)
|
print("Loaded model '%s'" % model)
|
||||||
else:
|
else:
|
||||||
nlp = spacy.blank('en') # create blank Language class
|
nlp = spacy.blank("en") # create blank Language class
|
||||||
print("Created blank 'en' model")
|
print("Created blank 'en' model")
|
||||||
|
|
||||||
# create the built-in pipeline components and add them to the pipeline
|
# create the built-in pipeline components and add them to the pipeline
|
||||||
# nlp.create_pipe works for built-ins that are registered with spaCy
|
# nlp.create_pipe works for built-ins that are registered with spaCy
|
||||||
if 'ner' not in nlp.pipe_names:
|
if "ner" not in nlp.pipe_names:
|
||||||
ner = nlp.create_pipe('ner')
|
ner = nlp.create_pipe("ner")
|
||||||
nlp.add_pipe(ner, last=True)
|
nlp.add_pipe(ner, last=True)
|
||||||
# otherwise, get it so we can add labels
|
# otherwise, get it so we can add labels
|
||||||
else:
|
else:
|
||||||
ner = nlp.get_pipe('ner')
|
ner = nlp.get_pipe("ner")
|
||||||
|
|
||||||
# add labels
|
# add labels
|
||||||
for _, annotations in TRAIN_DATA:
|
for _, annotations in TRAIN_DATA:
|
||||||
for ent in annotations.get('entities'):
|
for ent in annotations.get("entities"):
|
||||||
ner.add_label(ent[2])
|
ner.add_label(ent[2])
|
||||||
|
|
||||||
# get names of other pipes to disable them during training
|
# get names of other pipes to disable them during training
|
||||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
|
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
|
||||||
with nlp.disable_pipes(*other_pipes): # only train NER
|
with nlp.disable_pipes(*other_pipes): # only train NER
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
losses = {}
|
losses = {}
|
||||||
# batch up the examples using spaCy's minibatch
|
# batch up the examples using spaCy's minibatch
|
||||||
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
|
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
texts, annotations = zip(*batch)
|
texts, annotations = zip(*batch)
|
||||||
nlp.update(
|
nlp.update(
|
||||||
|
@ -72,14 +69,15 @@ def main(model=None, output_dir=None, n_iter=100):
|
||||||
annotations, # batch of annotations
|
annotations, # batch of annotations
|
||||||
drop=0.5, # dropout - make it harder to memorise data
|
drop=0.5, # dropout - make it harder to memorise data
|
||||||
sgd=optimizer, # callable to update weights
|
sgd=optimizer, # callable to update weights
|
||||||
losses=losses)
|
losses=losses,
|
||||||
print('Losses', losses)
|
)
|
||||||
|
print("Losses", losses)
|
||||||
|
|
||||||
# test the trained model
|
# test the trained model
|
||||||
for text, _ in TRAIN_DATA:
|
for text, _ in TRAIN_DATA:
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
|
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
|
||||||
print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
|
print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
|
||||||
|
|
||||||
# save model to output directory
|
# save model to output directory
|
||||||
if output_dir is not None:
|
if output_dir is not None:
|
||||||
|
@ -94,11 +92,11 @@ def main(model=None, output_dir=None, n_iter=100):
|
||||||
nlp2 = spacy.load(output_dir)
|
nlp2 = spacy.load(output_dir)
|
||||||
for text, _ in TRAIN_DATA:
|
for text, _ in TRAIN_DATA:
|
||||||
doc = nlp2(text)
|
doc = nlp2(text)
|
||||||
print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
|
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
|
||||||
print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
|
print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
plac.call(main)
|
plac.call(main)
|
||||||
|
|
||||||
# Expected output:
|
# Expected output:
|
||||||
|
|
|
@ -35,7 +35,7 @@ from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
|
||||||
# new entity label
|
# new entity label
|
||||||
LABEL = 'ANIMAL'
|
LABEL = "ANIMAL"
|
||||||
|
|
||||||
# training data
|
# training data
|
||||||
# Note: If you're using an existing model, make sure to mix in examples of
|
# Note: If you're using an existing model, make sure to mix in examples of
|
||||||
|
@ -43,29 +43,21 @@ LABEL = 'ANIMAL'
|
||||||
# model might learn the new type, but "forget" what it previously knew.
|
# model might learn the new type, but "forget" what it previously knew.
|
||||||
# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
|
# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
|
||||||
TRAIN_DATA = [
|
TRAIN_DATA = [
|
||||||
("Horses are too tall and they pretend to care about your feelings", {
|
(
|
||||||
'entities': [(0, 6, 'ANIMAL')]
|
"Horses are too tall and they pretend to care about your feelings",
|
||||||
}),
|
{"entities": [(0, 6, "ANIMAL")]},
|
||||||
|
),
|
||||||
("Do they bite?", {
|
("Do they bite?", {"entities": []}),
|
||||||
'entities': []
|
(
|
||||||
}),
|
"horses are too tall and they pretend to care about your feelings",
|
||||||
|
{"entities": [(0, 6, "ANIMAL")]},
|
||||||
("horses are too tall and they pretend to care about your feelings", {
|
),
|
||||||
'entities': [(0, 6, 'ANIMAL')]
|
("horses pretend to care about your feelings", {"entities": [(0, 6, "ANIMAL")]}),
|
||||||
}),
|
(
|
||||||
|
"they pretend to care about your feelings, those horses",
|
||||||
("horses pretend to care about your feelings", {
|
{"entities": [(48, 54, "ANIMAL")]},
|
||||||
'entities': [(0, 6, 'ANIMAL')]
|
),
|
||||||
}),
|
("horses?", {"entities": [(0, 6, "ANIMAL")]}),
|
||||||
|
|
||||||
("they pretend to care about your feelings, those horses", {
|
|
||||||
'entities': [(48, 54, 'ANIMAL')]
|
|
||||||
}),
|
|
||||||
|
|
||||||
("horses?", {
|
|
||||||
'entities': [(0, 6, 'ANIMAL')]
|
|
||||||
})
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@@ -73,25 +65,26 @@ TRAIN_DATA = [
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
def main(model=None, new_model_name="animal", output_dir=None, n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')
        ner = nlp.get_pipe("ner")

    ner.add_label(LABEL)  # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
@@ -100,21 +93,20 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print('Losses', losses)
            print("Losses", losses)

    # test the trained model
    test_text = 'Do you like horses?'
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
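The loops above batch the training data with `minibatch` and a `compounding` size schedule. As a rough sketch of what that schedule does (spaCy 2.x `spacy.util`; the toy data below is made up):

```python
from spacy.util import minibatch, compounding

# compounding(4.0, 32.0, 1.001) yields 4.0, then 4.0 * 1.001, then that * 1.001,
# and so on, capped at 32.0 -- batches start small and grow over training.
sizes = compounding(4.0, 32.0, 1.001)
print(next(sizes), next(sizes), next(sizes))  # roughly 4.0, 4.004, 4.008

# minibatch() pulls the next size for every batch it emits
for batch in minibatch(range(10), size=compounding(2.0, 4.0, 1.5)):
    print(list(batch))  # groups of 2, then 3, then 4, then the remainder
```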
@@ -125,7 +117,7 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

@@ -137,5 +129,5 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
            print(ent.label_, ent.text)


if __name__ == '__main__':
if __name__ == "__main__":
    plac.call(main)
@@ -18,62 +18,69 @@ from spacy.util import minibatch, compounding

# training data
TRAIN_DATA = [
    ("They trade mortgage-backed securities.", {
        'heads': [1, 1, 4, 4, 5, 1, 1],
        'deps': ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
    }),
    ("I like London and Berlin.", {
        'heads': [1, 1, 1, 2, 2, 1],
        'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
    })
    (
        "They trade mortgage-backed securities.",
        {
            "heads": [1, 1, 4, 4, 5, 1, 1],
            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
        },
    ),
    (
        "I like London and Berlin.",
        {
            "heads": [1, 1, 1, 2, 2, 1],
            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
        },
    ),
]


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=10):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the parser to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'parser' not in nlp.pipe_names:
    if "parser" not in nlp.pipe_names:
        parser = nlp.create_pipe('parser')
        parser = nlp.create_pipe("parser")
        nlp.add_pipe(parser, first=True)
    # otherwise, get it, so we can add labels to it
    else:
        parser = nlp.get_pipe('parser')
        parser = nlp.get_pipe("parser")

    # add labels to the parser
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get('deps', []):
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print('Losses', losses)
            print("Losses", losses)

    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
    print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])

    # save model to output directory
    if output_dir is not None:
@@ -87,10 +94,10 @@ def main(model=None, output_dir=None, n_iter=10):
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
        print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])


if __name__ == '__main__':
if __name__ == "__main__":
    plac.call(main)

    # expected result:
@@ -25,11 +25,7 @@ from spacy.util import minibatch, compounding
# http://universaldependencies.github.io/docs/u/pos/index.html
# You may also specify morphological features for your tags, from the universal
# scheme.
TAG_MAP = {
    'N': {'pos': 'NOUN'},
    'V': {'pos': 'VERB'},
    'J': {'pos': 'ADJ'}
}
TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}

# Usually you'll read this in, of course. Data formats vary. Ensure your
# strings are unicode and that the number of tags assigned matches spaCy's
@@ -37,16 +33,17 @@ TAG_MAP = {
# that specifies the gold-standard tokenization, e.g.:
# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'] 'tags': ['V', 'J', 'N']})
TRAIN_DATA = [
    ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}),
    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
    ("Eat blue ham", {'tags': ['V', 'J', 'N']})
    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
]


@plac.annotations(
    lang=("ISO Code of language to use", "option", "l", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(lang='en', output_dir=None, n_iter=25):
def main(lang="en", output_dir=None, n_iter=25):
    """Create a new model, set up the pipeline and train the tagger. In order to
    train the tagger with a custom tag map, we're creating a new Language
    instance with a custom vocab.
@@ -54,7 +51,7 @@ def main(lang='en', output_dir=None, n_iter=25):
    nlp = spacy.blank(lang)
    # add the tagger to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    tagger = nlp.create_pipe('tagger')
    tagger = nlp.create_pipe("tagger")
    # Add the tags. This needs to be done before you start training.
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
@@ -65,16 +62,16 @@ def main(lang='en', output_dir=None, n_iter=25):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
        print('Losses', losses)
        print("Losses", losses)

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])
    print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

    # save model to output directory
    if output_dir is not None:
@@ -88,10 +85,10 @@ def main(lang='en', output_dir=None, n_iter=25):
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])
        print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])


if __name__ == '__main__':
if __name__ == "__main__":
    plac.call(main)

# Expected output:
@@ -23,55 +23,62 @@ from spacy.util import minibatch, compounding
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_texts=("Number of texts to train from", "option", "t", int),
    n_iter=("Number of training iterations", "option", "n", int))
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        textcat = nlp.create_pipe("textcat")
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label('POSITIVE')
    textcat.add_label("POSITIVE")

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    # test the trained model
    test_text = "This movie sucked"
@@ -99,7 +106,7 @@ def load_data(limit=0, split=0.8):
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{'POSITIVE': bool(y)} for y in labels]
    cats = [{"POSITIVE": bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

@@ -116,9 +123,9 @@ def evaluate(tokenizer, textcat, texts, cats):
        if label not in gold:
            continue
        if score >= 0.5 and gold[label] >= 0.5:
            tp += 1.
            tp += 1.0
        elif score >= 0.5 and gold[label] < 0.5:
            fp += 1.
            fp += 1.0
        elif score < 0.5 and gold[label] < 0.5:
            tn += 1
        elif score < 0.5 and gold[label] >= 0.5:
@@ -126,8 +133,8 @@ def evaluate(tokenizer, textcat, texts, cats):
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


if __name__ == '__main__':
if __name__ == "__main__":
    plac.call(main)
@@ -14,8 +14,13 @@ from spacy.language import Language

@plac.annotations(
    vectors_loc=("Path to .vec file", "positional", None, str),
    lang=("Optional language ID. If not set, blank Language() will be used.",
          "positional", None, str))
    lang=(
        "Optional language ID. If not set, blank Language() will be used.",
        "positional",
        None,
        str,
    ),
)
def main(vectors_loc, lang=None):
    if lang is None:
        nlp = Language()
@@ -24,21 +29,21 @@ def main(vectors_loc, lang=None):
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
    with open(vectors_loc, "rb") as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode('utf8')
            line = line.rstrip().decode("utf8")
            pieces = line.rsplit(' ', int(nr_dim))
            pieces = line.rsplit(" ", int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    # test the vectors and similarity
    text = 'class colspan'
    text = "class colspan"
    doc = nlp(text)
    print(text, doc[0].similarity(doc[1]))


if __name__ == '__main__':
if __name__ == "__main__":
    plac.call(main)
@@ -14,26 +14,45 @@ import plac
import spacy
import tensorflow as tf
import tqdm
from tensorflow.contrib.tensorboard.plugins.projector import visualize_embeddings, ProjectorConfig
from tensorflow.contrib.tensorboard.plugins.projector import (
    visualize_embeddings,
    ProjectorConfig,
)


@plac.annotations(
    vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str),
    out_loc=("Path to output folder for tensorboard session data", "positional", None, str),
    name=("Human readable name for tsv file and vectors tensor", "positional", None, str),
    out_loc=(
        "Path to output folder for tensorboard session data",
        "positional",
        None,
        str,
    ),
    name=(
        "Human readable name for tsv file and vectors tensor",
        "positional",
        None,
        str,
    ),
)
def main(vectors_loc, out_loc, name="spaCy_vectors"):
    meta_file = "{}.tsv".format(name)
    out_meta_file = path.join(out_loc, meta_file)

    print('Loading spaCy vectors model: {}'.format(vectors_loc))
    print("Loading spaCy vectors model: {}".format(vectors_loc))
    model = spacy.load(vectors_loc)
    print('Finding lexemes with vectors attached: {}'.format(vectors_loc))
    print("Finding lexemes with vectors attached: {}".format(vectors_loc))
    strings_stream = tqdm.tqdm(model.vocab.strings, total=len(model.vocab.strings), leave=False)
    strings_stream = tqdm.tqdm(
        model.vocab.strings, total=len(model.vocab.strings), leave=False
    )
    queries = [w for w in strings_stream if model.vocab.has_vector(w)]
    vector_count = len(queries)

    print('Building Tensorboard Projector metadata for ({}) vectors: {}'.format(vector_count, out_meta_file))
    print(
        "Building Tensorboard Projector metadata for ({}) vectors: {}".format(
            vector_count, out_meta_file
        )
    )

    # Store vector data in a tensorflow variable
    tf_vectors_variable = numpy.zeros((vector_count, model.vocab.vectors.shape[1]))
@@ -41,22 +60,26 @@ def main(vectors_loc, out_loc, name="spaCy_vectors"):
    # Write a tab-separated file that contains information about the vectors for visualization
    #
    # Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata
    with open(out_meta_file, 'wb') as file_metadata:
    with open(out_meta_file, "wb") as file_metadata:
        # Define columns in the first row
        file_metadata.write("Text\tFrequency\n".encode('utf-8'))
        file_metadata.write("Text\tFrequency\n".encode("utf-8"))
        # Write out a row for each vector that we add to the tensorflow variable we created
        vec_index = 0
        for text in tqdm.tqdm(queries, total=len(queries), leave=False):
            # https://github.com/tensorflow/tensorflow/issues/9094
            text = '<Space>' if text.lstrip() == '' else text
            text = "<Space>" if text.lstrip() == "" else text
            lex = model.vocab[text]

            # Store vector data and metadata
            tf_vectors_variable[vec_index] = model.vocab.get_vector(text)
            file_metadata.write("{}\t{}\n".format(text, math.exp(lex.prob) * vector_count).encode('utf-8'))
            file_metadata.write(
                "{}\t{}\n".format(text, math.exp(lex.prob) * vector_count).encode(
                    "utf-8"
                )
            )
            vec_index += 1

    print('Running Tensorflow Session...')
    print("Running Tensorflow Session...")
    sess = tf.InteractiveSession()
    tf.Variable(tf_vectors_variable, trainable=False, name=name)
    tf.global_variables_initializer().run()
@@ -73,10 +96,10 @@ def main(vectors_loc, out_loc, name="spaCy_vectors"):
    visualize_embeddings(writer, config)

    # Save session and print run command to the output
    print('Saving Tensorboard Session...')
    print("Saving Tensorboard Session...")
    saver.save(sess, path.join(out_loc, '{}.ckpt'.format(name)))
    saver.save(sess, path.join(out_loc, "{}.ckpt".format(name)))
    print('Done. Run `tensorboard --logdir={0}` to view in Tensorboard'.format(out_loc))
    print("Done. Run `tensorboard --logdir={0}` to view in Tensorboard".format(out_loc))


if __name__ == '__main__':
if __name__ == "__main__":
    plac.call(main)
@@ -1,17 +1,17 @@
numpy>=1.15.0
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=2.0.1,<2.1.0
thinc==7.0.0.dev3
thinc==7.0.0.dev6
blis>=0.2.2,<0.3.0
murmurhash>=0.28.0,<1.1.0
cytoolz>=0.9.0,<0.10.0
wasabi>=0.0.8,<1.1.0
plac<1.0.0,>=0.9.6
srsly>=0.0.5,<1.1.0
ujson>=1.35
# Third party dependencies
dill>=0.2,<0.3
numpy>=1.15.0
regex==2018.01.10
requests>=2.13.0,<3.0.0
jsonschema>=2.6.0,<3.0.0
wasabi>=0.0.8,<1.1.0
regex==2018.01.10
plac<1.0.0,>=0.9.6
pathlib==1.0.1; python_version < "3.4"
# Development dependencies
cython>=0.25

35 setup.py
@@ -7,10 +7,27 @@ import sys
import contextlib
from distutils.command.build_ext import build_ext
from distutils.sysconfig import get_python_inc
import distutils.util
from distutils import ccompiler, msvccompiler
from setuptools import Extension, setup, find_packages


def is_new_osx():
    '''Check whether we're on OSX >= 10.10'''
    name = distutils.util.get_platform()
    if sys.platform != 'darwin':
        return False
    elif name.startswith('macosx-10'):
        minor_version = int(name.split('-')[1].split('.')[1])
        if minor_version >= 7:
            return True
        else:
            return False
    else:
        return False


PACKAGE_DATA = {"": ["*.pyx", "*.pxd", "*.txt", "*.tokens"]}
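`is_new_osx()` keys off the platform string returned by `distutils.util.get_platform()`. A minimal sketch of the parsing it performs, using a hypothetical platform string (the real value depends on the machine and the Python build):

```python
# Hypothetical value; on a Mac this typically looks like "macosx-10.<minor>-<arch>"
name = "macosx-10.13-x86_64"

minor_version = int(name.split("-")[1].split(".")[1])  # "10.13" -> 13
print(minor_version >= 7)  # True, so the build would link against libc++
```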
@@ -57,8 +74,17 @@ COMPILE_OPTIONS = {
LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []}


# I don't understand this very well yet. See Issue #267
# Fingers crossed!
if is_new_osx():
    # On Mac, use libc++ because Apple deprecated use of
    # libstdc
    COMPILE_OPTIONS["other"].append("-stdlib=libc++")
    LINK_OPTIONS["other"].append("-lc++")
    # g++ (used by unix compiler on mac) links to libstdc++ as a default lib.
    # See: https://stackoverflow.com/questions/1653047/avoid-linking-to-libstdc
    LINK_OPTIONS["other"].append("-nodefaultlibs")


USE_OPENMP_DEFAULT = "0" if sys.platform != "darwin" else None
if os.environ.get("USE_OPENMP", USE_OPENMP_DEFAULT) == "1":
    if sys.platform == "darwin":
@@ -200,15 +226,14 @@ def setup_package():
            "murmurhash>=0.28.0,<1.1.0",
            "cymem>=2.0.2,<2.1.0",
            "preshed>=2.0.1,<2.1.0",
            "thinc==7.0.0.dev3",
            "thinc==7.0.0.dev6",
            "blis>=0.2.2,<0.3.0",
            "plac<1.0.0,>=0.9.6",
            "ujson>=1.35",
            "regex==2018.01.10",
            "dill>=0.2,<0.3",
            "requests>=2.13.0,<3.0.0",
            "jsonschema>=2.6.0,<3.0.0",
            "wasabi>=0.0.8,<1.1.0",
            "srsly>=0.0.5,<1.1.0",
            'pathlib==1.0.1; python_version < "3.4"',
        ],
        setup_requires=["wheel"],
@@ -8,8 +8,9 @@ from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.t2v import Pooling, sum_pool
from thinc.misc import Residual
from thinc.misc import LayerNorm as LN
from thinc.misc import FeatureExtracter
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
from thinc.api import with_getitem, flatten_add_lengths
from thinc.api import uniqued, wrap, noop
from thinc.api import with_square_sequences
from thinc.linear.linear import LinearModel
@@ -10,7 +10,7 @@ __uri__ = "https://spacy.io"
__author__ = "Explosion AI"
__email__ = "contact@explosion.ai"
__license__ = "MIT"
__release__ = False
__release__ = True

__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@@ -4,9 +4,9 @@ from __future__ import unicode_literals
import plac
from pathlib import Path
from wasabi import Printer
import srsly

from ..util import write_jsonl, write_json
from ..compat import json_dumps, path2str
from ..compat import path2str
from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
from .converters import ner_jsonl2json
from ._messages import Messages
@@ -77,9 +77,9 @@ def convert(
        suffix = ".{}".format(file_type)
        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
        if file_type == "json":
            write_json(output_file, data)
            srsly.write_json(output_file, data)
        elif file_type == "jsonl":
            write_jsonl(output_file, data)
            srsly.write_jsonl(output_file, data)
        msg.good(
            Messages.M032.format(name=path2str(output_file)),
            Messages.M033.format(n_docs=len(data)),
@@ -87,7 +87,6 @@ def convert(
    else:
        # Print to stdout
        if file_type == "json":
            print(json_dumps(data))
            srsly.write_json("-", data)
        elif file_type == "jsonl":
            for line in data:
                print(json_dumps(line))
            srsly.write_jsonl("-", data)
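The CLI now serializes through `srsly` instead of the old `util.write_json`/`json_dumps` helpers. A small sketch of the calls involved, with made-up paths; passing `"-"` as the location writes to stdout, which is what the stdout branch above relies on:

```python
import srsly

data = [{"text": "hello"}, {"text": "world"}]
srsly.write_json("/tmp/docs.json", data)    # regular JSON file
srsly.write_jsonl("/tmp/docs.jsonl", data)  # newline-delimited JSON, one object per line
srsly.write_json("-", data)                 # "-" prints to stdout instead of writing a file
```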
@@ -1,9 +1,8 @@
# coding: utf8
from __future__ import unicode_literals

from cytoolz import partition_all

from ...gold import iob_to_biluo
from ...util import minibatch


def iob2json(input_data, n_sents=10, *args, **kwargs):
@@ -11,7 +10,7 @@ def iob2json(input_data, n_sents=10, *args, **kwargs):
    Convert IOB files into JSON format for use with train cli.
    """
    docs = []
    for group in partition_all(n_sents, docs):
    for group in minibatch(docs, n_sents):
        group = list(group)
        first = group.pop(0)
        to_extend = first["paragraphs"][0]["sentences"]
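`minibatch` from `spacy.util` takes over the grouping previously done by `cytoolz.partition_all`. A minimal sketch of the equivalence, with made-up sentences:

```python
from spacy.util import minibatch

sents = ["sentence {}".format(i) for i in range(7)]
# Like partition_all(3, sents): groups of up to 3 items, the last one may be short
for group in minibatch(sents, 3):
    print(list(group))
# ['sentence 0', 'sentence 1', 'sentence 2']
# ['sentence 3', 'sentence 4', 'sentence 5']
# ['sentence 6']
```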
@@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals

import ujson
import srsly

from ...util import get_lang_class
from .._messages import Messages
@@ -11,7 +11,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
    if lang is None:
        raise ValueError(Messages.M054)
    json_docs = []
    input_tuples = [ujson.loads(line) for line in input_data]
    input_tuples = [srsly.json_loads(line) for line in input_data]
    nlp = get_lang_class(lang)()
    for i, (raw_text, ents) in enumerate(input_tuples):
        doc = nlp.make_doc(raw_text)
@@ -5,10 +5,11 @@ from pathlib import Path
from collections import Counter
import plac
import sys
import srsly
from wasabi import Printer, MESSAGES

from ..gold import GoldCorpus, read_json_object
from ..util import load_model, get_lang_class, read_json, read_jsonl
from ..util import load_model, get_lang_class

# from .schemas import get_schema, validate_json
from ._messages import Messages
@@ -320,11 +321,11 @@ def debug_data(
def _load_file(file_path, msg):
    file_name = file_path.parts[-1]
    if file_path.suffix == ".json":
        data = read_json(file_path)
        data = srsly.read_json(file_path)
        msg.good("Loaded {}".format(file_name))
        return data
    elif file_path.suffix == ".jsonl":
        data = read_jsonl(file_path)
        data = srsly.read_jsonl(file_path)
        msg.good("Loaded {}".format(file_name))
        return data
    msg.fail(
@@ -5,6 +5,7 @@ import plac
import platform
from pathlib import Path
from wasabi import Printer
import srsly

from ._messages import Messages
from ..compat import path2str, basestring_, unicode_
@@ -32,7 +33,7 @@ def info(model=None, markdown=False, silent=False):
        meta_path = model_path / "meta.json"
        if not meta_path.is_file():
            msg.fail(Messages.M020, meta_path, exits=1)
        meta = util.read_json(meta_path)
        meta = srsly.read_json(meta_path)
        if model_path.resolve() != model_path:
            meta["link"] = path2str(model_path)
            meta["source"] = path2str(model_path.resolve())
@@ -11,12 +11,13 @@ from preshed.counter import PreshCounter
import tarfile
import gzip
import zipfile
import srsly
from wasabi import Printer

from ._messages import Messages
from ..vectors import Vectors
from ..errors import Errors, Warnings, user_warning
from ..util import ensure_path, get_lang_class, read_jsonl
from ..util import ensure_path, get_lang_class

try:
    import ftfy
@@ -33,7 +34,7 @@ msg = Printer()
    freqs_loc=("Location of words frequencies file", "option", "f", Path),
    jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
    clusters_loc=("Optional location of brown clusters data", "option", "c", str),
    vectors_loc=("Optional vectors file in Word2Vec format" "option", "v", str),
    vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
    prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
)
def init_model(
@@ -59,7 +60,7 @@ def init_model(
        settings.append("-c")
        msg.warn(Messages.M063, Messages.M064)
        jsonl_loc = ensure_path(jsonl_loc)
        lex_attrs = read_jsonl(jsonl_loc)
        lex_attrs = srsly.read_jsonl(jsonl_loc)
    else:
        clusters_loc = ensure_path(clusters_loc)
        freqs_loc = ensure_path(freqs_loc)
@@ -5,9 +5,10 @@ import plac
import shutil
from pathlib import Path
from wasabi import Printer, get_raw_input
import srsly

from ._messages import Messages
from ..compat import path2str, json_dumps
from ..compat import path2str
from .. import util
from .. import about

@@ -40,7 +41,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
    meta_path = meta_path or input_path / "meta.json"
    if meta_path.is_file():
        meta = util.read_json(meta_path)
        meta = srsly.read_json(meta_path)
        if not create_meta:  # only print if user doesn't want to overwrite
            msg.good(Messages.M041, meta_path)
        else:
@@ -64,7 +65,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
    )
    Path.mkdir(package_path, parents=True)
    shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
    create_file(main_path / "meta.json", json_dumps(meta))
    create_file(main_path / "meta.json", srsly.json_dumps(meta))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    create_file(package_path / "__init__.py", TEMPLATE_INIT)
@@ -5,8 +5,6 @@ import plac
import random
import numpy
import time
import ujson
import sys
from collections import Counter
from pathlib import Path
from thinc.v2v import Affine, Maxout
@@ -14,10 +12,10 @@ from thinc.api import wrap
from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu
from wasabi import Printer
import srsly

from ..tokens import Doc
from ..attrs import ID, HEAD
from ..compat import json_dumps
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
from .. import util

@@ -72,7 +70,7 @@ def pretrain(
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    util.write_json(output_dir / "config.json", config)
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
@@ -81,12 +79,12 @@ def pretrain(
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(util.read_jsonl(texts_loc))
            texts = list(srsly.read_jsonl(texts_loc))
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = stream_texts()
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
@@ -130,18 +128,13 @@ def pretrain(
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(json_dumps(log) + "\n")
                file_.write(srsly.json_dumps(log) + "\n")
            tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)


def stream_texts():
    for line in sys.stdin:
        yield ujson.loads(line)


def make_update(model, docs, optimizer, drop=0.0):
    """Perform an update over a single batch of documents.
@@ -3,12 +3,12 @@ from __future__ import unicode_literals, division, print_function

import plac
from pathlib import Path
import ujson
import srsly
import cProfile
import pstats
import sys
import tqdm
import cytoolz
import itertools
import thinc.extra.datasets
from wasabi import Printer

@@ -40,7 +40,7 @@ def profile(model, inputs=None, n_texts=10000):
    with msg.loading("Loading model '{}'...".format(model)):
        nlp = load_model(model)
    msg.good("Loaded model '{}'".format(model))
    texts = list(cytoolz.take(n_texts, inputs))
    texts = list(itertools.islice(inputs, n_texts))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
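`itertools.islice(inputs, n)` is the standard-library replacement for `cytoolz.take(n, inputs)`: both lazily yield at most the first `n` items of a stream. A minimal sketch with a made-up generator:

```python
import itertools

def fake_inputs():
    i = 0
    while True:  # an endless stream of texts
        yield "text {}".format(i)
        i += 1

texts = list(itertools.islice(fake_inputs(), 3))
print(texts)  # ['text 0', 'text 1', 'text 2']
```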
@@ -64,6 +64,6 @@ def _read_inputs(loc, msg):
        msg.info("Using data from {}".format(input_path.parts[-1]))
        file_ = input_path.open()
    for line in file_:
        data = ujson.loads(line)
        data = srsly.json_loads(line)
        text = data["text"]
        yield text
@@ -3,9 +3,9 @@ from __future__ import unicode_literals

from pathlib import Path
from jsonschema import Draft4Validator
import srsly

from ...errors import Errors
from ...util import read_json


SCHEMAS = {}
@@ -25,7 +25,7 @@ def get_schema(name):
    schema_path = Path(__file__).parent / "{}.json".format(name)
    if not schema_path.exists():
        raise ValueError(Errors.E104.format(name=name))
    schema = read_json(schema_path)
    schema = srsly.read_json(schema_path)
    # TODO: replace with (stable) Draft6Validator, if available
    validator = Draft4Validator(schema)
    validator.check_schema(schema)
@@ -7,6 +7,7 @@ import tqdm
from thinc.neural._classes.model import Model
from timeit import default_timer as timer
import shutil
import srsly
from wasabi import Printer

from ._messages import Messages
@@ -111,7 +112,7 @@ def train(
        msg.fail(Messages.M051, dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail(Messages.M020, meta_path, exits=1)
    meta = util.read_json(meta_path) if meta_path else {}
    meta = srsly.read_json(meta_path) if meta_path else {}
    if not isinstance(meta, dict):
        msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1)
    if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
@@ -226,7 +227,7 @@ def train(
            end_time = timer()
            cpu_wps = nwords / (end_time - start_time)
        acc_loc = output_path / ("model%d" % i) / "accuracy.json"
        util.write_json(acc_loc, scorer.scores)
        srsly.write_json(acc_loc, scorer.scores)

        # Update model meta.json
        meta["lang"] = nlp.lang
@@ -242,7 +243,7 @@ def train(
        meta.setdefault("name", "model%d" % i)
        meta.setdefault("version", version)
        meta_loc = output_path / ("model%d" % i) / "meta.json"
        util.write_json(meta_loc, meta)
        srsly.write_json(meta_loc, meta)

        util.set_env_log(verbose)

@@ -293,17 +294,17 @@ def _collate_best_model(meta, output_path, components):
    for component, best_component_src in bests.items():
        shutil.rmtree(best_dest / component)
        shutil.copytree(best_component_src / component, best_dest / component)
        accs = util.read_json(best_component_src / "accuracy.json")
        accs = srsly.read_json(best_component_src / "accuracy.json")
        for metric in _get_metrics(component):
            meta["accuracy"][metric] = accs[metric]
    util.write_json(best_dest / "meta.json", meta)
    srsly.write_json(best_dest / "meta.json", meta)


def _find_best(experiment_dir, component):
    accuracies = []
    for epoch_model in experiment_dir.iterdir():
        if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
            accs = util.read_json(epoch_model / "accuracy.json")
            accs = srsly.read_json(epoch_model / "accuracy.json")
            scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
            accuracies.append((scores, epoch_model))
    if accuracies:
@@ -9,7 +9,7 @@ import tqdm
from pathlib import Path
import re
import sys
import json
import srsly

import spacy
import spacy.util
@@ -30,7 +30,6 @@ Fused_inside = None
import itertools
import random
import numpy.random
import cytoolz

from . import conll17_ud_eval

@@ -44,7 +43,7 @@ from ...lang import ru
# Data reading #
################

space_re = re.compile("\s+")
space_re = re.compile(r"\s+")


def split_text(text):
@@ -332,8 +331,7 @@ def main(test_data_dir, experiment_dir, corpus):
        / corpus
        / "{section}-accuracy.json".format(section=section)
    )
    with open(acc_path, "w") as file_:
        file_.write(json.dumps(accuracy, indent=2))
    srsly.write_json(acc_path, accuracy)


if __name__ == "__main__":
@@ -25,7 +25,6 @@ from timeit import default_timer as timer
 import itertools
 import random
 import numpy.random
-import cytoolz

 from . import conll17_ud_eval

@@ -5,11 +5,12 @@ import pkg_resources
 from pathlib import Path
 import sys
 import requests
+import srsly
 from wasabi import Printer

 from ._messages import Messages
 from ..compat import path2str
-from ..util import get_data_path, read_json
+from ..util import get_data_path
 from .. import about


@@ -84,7 +85,7 @@ def get_model_links(compat):
 meta_path = Path(model) / "meta.json"
 if not meta_path.exists():
 continue
-meta = read_json(meta_path)
+meta = srsly.read_json(meta_path)
 link = model.parts[-1]
 name = meta["lang"] + "_" + meta["name"]
 links[link] = {
@@ -3,7 +3,6 @@ from __future__ import unicode_literals

 import os
 import sys
-import ujson
 import itertools

 from thinc.neural.util import copy_array

@@ -54,9 +53,6 @@ if is_python2:
 unicode_ = unicode  # noqa: F821
 basestring_ = basestring  # noqa: F821
 input_ = raw_input  # noqa: F821
-json_dumps = lambda data, indent=2: ujson.dumps(
-    data, indent=indent, escape_forward_slashes=False
-).decode("utf8")
 path2str = lambda path: str(path).decode("utf8")

 elif is_python3:

@@ -64,9 +60,6 @@ elif is_python3:
 unicode_ = str
 basestring_ = str
 input_ = input
-json_dumps = lambda data, indent=2: ujson.dumps(
-    data, indent=indent, escape_forward_slashes=False
-)
 path2str = lambda path: str(path)

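Note: the `json_dumps` helpers removed from `compat` above are covered by `srsly.json_dumps`, which the rest of this diff switches to. A minimal sketch of the replacement call, assuming only that the `srsly` package introduced in this change is installed:

```python
import srsly

# srsly.json_dumps returns a unicode string on both Python 2 and 3,
# so the per-version lambda wrappers deleted above are no longer needed.
data = {"key": "value"}
serialized = srsly.json_dumps(data, indent=2)
assert srsly.json_loads(serialized) == data
```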
@@ -4,16 +4,11 @@ from __future__ import unicode_literals, print_function

 import re
 import random
-import cytoolz
-import itertools
 import numpy
 import tempfile
 import shutil
 from pathlib import Path
-import msgpack
-import json
+import srsly

-import ujson
 from . import _align
 from .syntax import nonproj

@@ -21,7 +16,6 @@ from .tokens import Doc
 from .errors import Errors
 from . import util
 from .util import minibatch, itershuffle
-from .compat import json_dumps

 from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek


@@ -123,12 +117,11 @@ class GoldCorpus(object):
 directory.mkdir()
 n = 0
 for i, doc_tuple in enumerate(doc_tuples):
-with open(directory / '{}.msg'.format(i), 'wb') as file_:
-    msgpack.dump([doc_tuple], file_, use_bin_type=True)
+srsly.write_msgpack(directory / '{}.msg'.format(i), [doc_tuple])
 n += len(doc_tuple[1])
 if limit and n >= limit:
 break

 @staticmethod
 def walk_corpus(path):
 path = util.ensure_path(path)

@@ -157,8 +150,7 @@ class GoldCorpus(object):
 if loc.parts[-1].endswith('json'):
 gold_tuples = read_json_file(loc)
 elif loc.parts[-1].endswith('msg'):
-with loc.open('rb') as file_:
-    gold_tuples = msgpack.load(file_, raw=False)
+gold_tuples = srsly.read_msgpack(loc)
 else:
 msg = "Cannot read from file: %s. Supported formats: .json, .msg"
 raise ValueError(msg % loc)

@@ -378,7 +370,7 @@ def _json_iterate(loc):
 if square_depth == 1 and curly_depth == 0:
 py_str = py_raw[start : i+1].decode('utf8')
 try:
-yield json.loads(py_str)
+yield srsly.json_loads(py_str)
 except Exception:
 print(py_str)
 raise
@@ -2,7 +2,6 @@
 from __future__ import absolute_import, unicode_literals

 import random
-import ujson
 import itertools
 import weakref
 import functools

@@ -10,6 +9,7 @@ from collections import OrderedDict
 from contextlib import contextmanager
 from copy import copy
 from thinc.neural import Model
+import srsly

 from .tokenizer import Tokenizer
 from .vocab import Vocab

@@ -18,7 +18,7 @@ from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
 from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
 from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
 from .pipeline import EntityRuler
-from .compat import json_dumps, izip, basestring_
+from .compat import izip, basestring_
 from .gold import GoldParse
 from .scorer import Scorer
 from ._ml import link_vectors_to_models, create_default_optimizer

@@ -640,7 +640,7 @@ class Language(object):
 serializers = OrderedDict(
 (
 ("tokenizer", lambda p: self.tokenizer.to_disk(p, vocab=False)),
-("meta.json", lambda p: p.open("w").write(json_dumps(self.meta))),
+("meta.json", lambda p: p.open("w").write(srsly.json_dumps(self.meta))),
 )
 )
 for name, proc in self.pipeline:

@@ -671,7 +671,7 @@ class Language(object):
 path = util.ensure_path(path)
 deserializers = OrderedDict(
 (
-("meta.json", lambda p: self.meta.update(util.read_json(p))),
+("meta.json", lambda p: self.meta.update(srsly.read_json(p))),
 (
 "vocab",
 lambda p: (

@@ -705,7 +705,7 @@ class Language(object):
 (
 ("vocab", lambda: self.vocab.to_bytes()),
 ("tokenizer", lambda: self.tokenizer.to_bytes(vocab=False)),
-("meta", lambda: json_dumps(self.meta)),
+("meta", lambda: srsly.json_dumps(self.meta)),
 )
 )
 for i, (name, proc) in enumerate(self.pipeline):

@@ -725,7 +725,7 @@ class Language(object):
 """
 deserializers = OrderedDict(
 (
-("meta", lambda b: self.meta.update(ujson.loads(b))),
+("meta", lambda b: self.meta.update(srsly.json_loads(b))),
 (
 "vocab",
 lambda b: (
@@ -5,12 +5,8 @@ from __future__ import unicode_literals

 import numpy
 cimport numpy as np
-import cytoolz
 from collections import OrderedDict, defaultdict
-import ujson
+import srsly

-from .util import msgpack
-from .util import msgpack_numpy

 from thinc.api import chain
 from thinc.v2v import Affine, Maxout, Softmax

@@ -27,7 +23,6 @@ from .syntax.arc_eager cimport ArcEager
 from .morphology cimport Morphology
 from .vocab cimport Vocab
 from .syntax import nonproj
-from .compat import json_dumps
 from .matcher import Matcher

 from .matcher import Matcher, PhraseMatcher

@@ -38,7 +33,7 @@ from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
 from ._ml import link_vectors_to_models, zero_init, flatten
 from ._ml import create_default_optimizer
 from .errors import Errors, TempErrors
-from .compat import json_dumps, basestring_
+from .compat import basestring_
 from . import util


@@ -235,7 +230,7 @@ class EntityRuler(object):
 **kwargs: Other config paramters, mostly for consistency.
 RETURNS (EntityRuler): The loaded entity ruler.
 """
-patterns = msgpack.loads(patterns_bytes, raw=False)
+patterns = srsly.msgpack_loads(patterns_bytes)
 self.add_patterns(patterns)
 return self

@@ -244,7 +239,7 @@ class EntityRuler(object):

 RETURNS (bytes): The serialized patterns.
 """
-return msgpack.dumps(self.patterns, use_bin_type=True)
+return srsly.msgpack_dumps(self.patterns)

 def from_disk(self, path, **kwargs):
 """Load the entity ruler from a file. Expects a file containing

@@ -256,7 +251,7 @@ class EntityRuler(object):
 """
 path = util.ensure_path(path)
 path = path.with_suffix('.jsonl')
-patterns = util.read_jsonl(path)
+patterns = srsly.read_jsonl(path)
 self.add_patterns(patterns)
 return self

@@ -270,8 +265,7 @@ class EntityRuler(object):
 """
 path = util.ensure_path(path)
 path = path.with_suffix('.jsonl')
-data = [json_dumps(line, indent=0) for line in self.patterns]
-path.open('w').write('\n'.join(data))
+srsly.write_jsonl(path, self.patterns)


 class Pipe(object):

@@ -307,7 +301,7 @@ class Pipe(object):
 Both __call__ and pipe should delegate to the `predict()`
 and `set_annotations()` methods.
 """
-for docs in cytoolz.partition_all(batch_size, stream):
+for docs in util.minibatch(stream, size=batch_size):
 docs = list(docs)
 scores, tensors = self.predict(docs)
 self.set_annotations(docs, scores, tensor=tensors)

@@ -368,7 +362,7 @@ class Pipe(object):
 def to_bytes(self, **exclude):
 """Serialize the pipe to a bytestring."""
 serialize = OrderedDict()
-serialize['cfg'] = lambda: json_dumps(self.cfg)
+serialize['cfg'] = lambda: srsly.json_dumps(self.cfg)
 if self.model in (True, False, None):
 serialize['model'] = lambda: self.model
 else:

@@ -387,7 +381,7 @@ class Pipe(object):
 self.model.from_bytes(b)

 deserialize = OrderedDict((
-('cfg', lambda b: self.cfg.update(ujson.loads(b))),
+('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
 ('vocab', lambda b: self.vocab.from_bytes(b)),
 ('model', load_model),
 ))

@@ -397,7 +391,7 @@ class Pipe(object):
 def to_disk(self, path, **exclude):
 """Serialize the pipe to disk."""
 serialize = OrderedDict()
-serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg))
+serialize['cfg'] = lambda p: srsly.write_json(p, self.cfg)
 serialize['vocab'] = lambda p: self.vocab.to_disk(p)
 if self.model not in (None, True, False):
 serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())

@@ -424,8 +418,7 @@ class Pipe(object):

 def _load_cfg(path):
 if path.exists():
-with path.open() as file_:
-    return ujson.load(file_)
+return srsly.read_json(path)
 else:
 return {}


@@ -485,7 +478,7 @@ class Tensorizer(Pipe):
 n_threads (int): Number of threads.
 YIELDS (iterator): A sequence of `Doc` objects, in order of input.
 """
-for docs in cytoolz.partition_all(batch_size, stream):
+for docs in util.minibatch(stream, size=batch_size):
 docs = list(docs)
 tensors = self.predict(docs)
 self.set_annotations(docs, tensors)

@@ -594,7 +587,7 @@ class Tagger(Pipe):
 return doc

 def pipe(self, stream, batch_size=128, n_threads=-1):
-for docs in cytoolz.partition_all(batch_size, stream):
+for docs in util.minibatch(stream, size=batch_size):
 docs = list(docs)
 tag_ids, tokvecs = self.predict(docs)
 self.set_annotations(docs, tag_ids, tensors=tokvecs)

@@ -745,10 +738,9 @@ class Tagger(Pipe):
 else:
 serialize['model'] = self.model.to_bytes
 serialize['vocab'] = self.vocab.to_bytes
-serialize['cfg'] = lambda: ujson.dumps(self.cfg)
+serialize['cfg'] = lambda: srsly.json_dumps(self.cfg)
 tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
-serialize['tag_map'] = lambda: msgpack.dumps(
-    tag_map, use_bin_type=True)
+serialize['tag_map'] = lambda: srsly.msgpack_dumps(tag_map)
 return util.to_bytes(serialize, exclude)

 def from_bytes(self, bytes_data, **exclude):

@@ -766,7 +758,7 @@ class Tagger(Pipe):
 self.model.from_bytes(b)

 def load_tag_map(b):
-tag_map = msgpack.loads(b, raw=False)
+tag_map = srsly.msgpack_loads(b)
 self.vocab.morphology = Morphology(
 self.vocab.strings, tag_map=tag_map,
 lemmatizer=self.vocab.morphology.lemmatizer,

@@ -775,7 +767,7 @@ class Tagger(Pipe):
 deserialize = OrderedDict((
 ('vocab', lambda b: self.vocab.from_bytes(b)),
 ('tag_map', load_tag_map),
-('cfg', lambda b: self.cfg.update(ujson.loads(b))),
+('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
 ('model', lambda b: load_model(b)),
 ))
 util.from_bytes(bytes_data, deserialize, exclude)

@@ -785,10 +777,9 @@ class Tagger(Pipe):
 tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
 serialize = OrderedDict((
 ('vocab', lambda p: self.vocab.to_disk(p)),
-('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
-    tag_map, use_bin_type=True))),
+('tag_map', lambda p: srsly.write_msgpack(p, tag_map)),
 ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
-('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
+('cfg', lambda p: srsly.write_json(p, self.cfg))
 ))
 util.to_disk(path, serialize, exclude)

@@ -803,8 +794,7 @@ class Tagger(Pipe):
 self.model.from_bytes(file_.read())

 def load_tag_map(p):
-with p.open('rb') as file_:
-    tag_map = msgpack.loads(file_.read(), raw=False)
+tag_map = srsly.read_msgpack(p)
 self.vocab.morphology = Morphology(
 self.vocab.strings, tag_map=tag_map,
 lemmatizer=self.vocab.morphology.lemmatizer,

@@ -1082,7 +1072,7 @@ class TextCategorizer(Pipe):
 return doc

 def pipe(self, stream, batch_size=128, n_threads=-1):
-for docs in cytoolz.partition_all(batch_size, stream):
+for docs in util.minibatch(stream, size=batch_size):
 docs = list(docs)
 scores, tensors = self.predict(docs)
 self.set_annotations(docs, scores, tensors=tensors)
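Note: the batching change above swaps `cytoolz.partition_all` for spaCy's own `util.minibatch`, which yields lists of up to `size` items from any iterator (see the `itertools.islice`-based implementation further down in this diff). A small usage sketch, assuming a checkout of this branch:

```python
from spacy import util

stream = (i for i in range(10))  # any iterator works, not just lists
batches = list(util.minibatch(stream, size=4))
# Each batch is a list of at most `size` items, yielded in input order.
assert batches == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
```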
@@ -7,12 +7,11 @@ from libc.string cimport memcpy
 from libcpp.set cimport set
 from libc.stdint cimport uint32_t
 from murmurhash.mrmr cimport hash64, hash32
-import ujson
+import srsly

 from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
 from .typedefs cimport hash_t
-from .compat import json_dumps
 from .errors import Errors
 from . import util


@@ -197,8 +196,7 @@ cdef class StringStore:
 """
 path = util.ensure_path(path)
 strings = list(self)
-with path.open('w') as file_:
-    file_.write(json_dumps(strings))
+srsly.write_json(path, strings)

 def from_disk(self, path):
 """Loads state from a directory. Modifies the object in place and

@@ -209,8 +207,7 @@ cdef class StringStore:
 RETURNS (StringStore): The modified `StringStore` object.
 """
 path = util.ensure_path(path)
-with path.open('r') as file_:
-    strings = ujson.load(file_)
+strings = srsly.read_json(path)
 prev = list(self)
 self._reset_and_load(strings)
 for word in prev:

@@ -223,7 +220,7 @@ cdef class StringStore:
 **exclude: Named attributes to prevent from being serialized.
 RETURNS (bytes): The serialized form of the `StringStore` object.
 """
-return json_dumps(list(self))
+return srsly.json_dumps(list(self))

 def from_bytes(self, bytes_data, **exclude):
 """Load state from a binary string.

@@ -232,7 +229,7 @@ cdef class StringStore:
 **exclude: Named attributes to prevent from being loaded.
 RETURNS (StringStore): The `StringStore` object.
 """
-strings = ujson.loads(bytes_data)
+strings = srsly.json_loads(bytes_data)
 prev = list(self)
 self._reset_and_load(strings)
 for word in prev:
@@ -5,11 +5,8 @@
 from __future__ import unicode_literals, print_function

 from collections import OrderedDict
-import ujson
-import json
 import numpy
 cimport cython.parallel
-import cytoolz
 import numpy.random
 cimport numpy as np
 from libc.math cimport exp

@@ -29,7 +26,7 @@ cimport blis.cy

 from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
 from .._ml import link_vectors_to_models, create_default_optimizer
-from ..compat import json_dumps, copy_array
+from ..compat import copy_array
 from ..tokens.doc cimport Doc
 from ..gold cimport GoldParse
 from ..errors import Errors, TempErrors

@@ -119,7 +116,7 @@ cdef void predict_states(ActivationsC* A, StateC** states,
 VecVec.add_i(&A.scores[i*n.classes],
 W.hidden_bias, 1., n.classes)


 cdef void sum_state_features(float* output,
 const float* cached, const int* token_ids, int B, int F, int O) nogil:
 cdef int idx, b, f, i

@@ -165,7 +162,7 @@ cdef void cpu_log_loss(float* d_scores,
 else:
 d_scores[i] = exp(scores[i]-max_) / Z


 cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
 const int* is_valid, int n) nogil:
 # Find minimum cost

@@ -218,15 +215,15 @@ class ParserModel(Model):

 def begin_training(self, X, y=None):
 self.lower.begin_training(X, y=y)

 @property
 def tok2vec(self):
 return self._layers[0]

 @property
 def lower(self):
 return self._layers[1]

 @property
 def upper(self):
 return self._layers[2]

@@ -405,4 +402,3 @@ cdef class precompute_hiddens:
 else:
 return self.ops.backprop_maxout(d_best, mask, self.nP)
 return state_vector, backprop_nonlinearity
-
@@ -5,13 +5,11 @@
 from __future__ import unicode_literals, print_function

 from collections import OrderedDict
-import ujson
-import json
 import numpy
 cimport cython.parallel
-import cytoolz
 import numpy.random
 cimport numpy as np
+from itertools import islice
 from cpython.ref cimport PyObject, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from libc.math cimport exp

@@ -27,6 +25,7 @@ from thinc.misc import LayerNorm
 from thinc.neural.ops import CupyOps
 from thinc.neural.util import get_array_module
 from thinc.linalg cimport Vec, VecVec
+import srsly

 from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid
 from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss

@@ -34,7 +33,7 @@ from ._parser_model cimport get_c_weights, get_c_sizes
 from ._parser_model import ParserModel
 from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
 from .._ml import link_vectors_to_models, create_default_optimizer
-from ..compat import json_dumps, copy_array
+from ..compat import copy_array
 from ..tokens.doc cimport Doc
 from ..gold cimport GoldParse
 from ..errors import Errors, TempErrors

@@ -214,10 +213,10 @@ cdef class Parser:
 beam_width = self.cfg.get('beam_width', 1)
 beam_density = self.cfg.get('beam_density', 0.)
 cdef Doc doc
-for batch in cytoolz.partition_all(batch_size, docs):
+for batch in util.minibatch(docs, size=batch_size):
 batch_in_order = list(batch)
 by_length = sorted(batch_in_order, key=lambda doc: len(doc))
-for subbatch in cytoolz.partition_all(8, by_length):
+for subbatch in util.minibatch(by_length, size=batch_size//4):
 subbatch = list(subbatch)
 parse_states = self.predict(subbatch, beam_width=beam_width,
 beam_density=beam_density)

@@ -517,7 +516,7 @@ cdef class Parser:
 sgd = self.create_optimizer()
 doc_sample = []
 gold_sample = []
-for raw_text, annots_brackets in cytoolz.take(1000, get_gold_tuples()):
+for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
 for annots, brackets in annots_brackets:
 ids, words, tags, heads, deps, ents = annots
 doc_sample.append(Doc(self.vocab, words=words))

@@ -539,7 +538,7 @@ cdef class Parser:
 'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
 'vocab': lambda p: self.vocab.to_disk(p),
 'moves': lambda p: self.moves.to_disk(p, strings=False),
-'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
+'cfg': lambda p: srsly.write_json(p, self.cfg)
 }
 util.to_disk(path, serializers, exclude)

@@ -547,7 +546,7 @@ cdef class Parser:
 deserializers = {
 'vocab': lambda p: self.vocab.from_disk(p),
 'moves': lambda p: self.moves.from_disk(p, strings=False),
-'cfg': lambda p: self.cfg.update(util.read_json(p)),
+'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
 'model': lambda p: None
 }
 util.from_disk(path, deserializers, exclude)

@@ -568,7 +567,7 @@ cdef class Parser:
 ('model', lambda: (self.model.to_bytes() if self.model is not True else True)),
 ('vocab', lambda: self.vocab.to_bytes()),
 ('moves', lambda: self.moves.to_bytes(strings=False)),
-('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True))
+('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True))
 ))
 return util.to_bytes(serializers, exclude)

@@ -576,7 +575,7 @@ cdef class Parser:
 deserializers = OrderedDict((
 ('vocab', lambda b: self.vocab.from_bytes(b)),
 ('moves', lambda b: self.moves.from_bytes(b, strings=False)),
-('cfg', lambda b: self.cfg.update(json.loads(b))),
+('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
 ('model', lambda b: None)
 ))
 msg = util.from_bytes(bytes_data, deserializers, exclude)
@@ -7,14 +7,13 @@ from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t
 from thinc.extra.search cimport Beam
 from collections import OrderedDict, Counter
-import ujson
+import srsly

 from . cimport _beam_utils
 from ..tokens.doc cimport Doc
 from ..structs cimport TokenC
 from .stateclass cimport StateClass
 from ..typedefs cimport attr_t
-from ..compat import json_dumps
 from ..errors import Errors
 from .. import util


@@ -153,13 +152,13 @@ cdef class TransitionSystem:
 # Make sure we take a copy here, and that we get a Counter
 self.labels[action] = Counter()
 # Have to be careful here: Sorting must be stable, or our model
 # won't be read back in correctly.
 sorted_labels = [(f, L) for L, f in label_freqs.items()]
 sorted_labels.sort()
 sorted_labels.reverse()
 for freq, label_str in sorted_labels:
 self.add_action(int(action), label_str)
 self.labels[action][label_str] = freq

 def add_action(self, int action, label_name):
 cdef attr_t label_id

@@ -204,7 +203,7 @@ cdef class TransitionSystem:
 def to_bytes(self, **exclude):
 transitions = []
 serializers = {
-'moves': lambda: json_dumps(self.labels),
+'moves': lambda: srsly.json_dumps(self.labels),
 'strings': lambda: self.strings.to_bytes()
 }
 return util.to_bytes(serializers, exclude)

@@ -212,7 +211,7 @@ cdef class TransitionSystem:
 def from_bytes(self, bytes_data, **exclude):
 labels = {}
 deserializers = {
-'moves': lambda b: labels.update(ujson.loads(b)),
+'moves': lambda b: labels.update(srsly.json_loads(b)),
 'strings': lambda b: self.strings.from_bytes(b)
 }
 msg = util.from_bytes(bytes_data, deserializers, exclude)
@@ -2,8 +2,8 @@
 from __future__ import unicode_literals

 import pytest
-import dill as pickle
 import numpy
+import srsly
 from spacy.strings import StringStore
 from spacy.vocab import Vocab
 from spacy.attrs import NORM

@@ -14,8 +14,8 @@ def test_pickle_string_store(text1, text2):
 stringstore = StringStore()
 store1 = stringstore[text1]
 store2 = stringstore[text2]
-data = pickle.dumps(stringstore, protocol=-1)
-unpickled = pickle.loads(data)
+data = srsly.pickle_dumps(stringstore, protocol=-1)
+unpickled = srsly.pickle_loads(data)
 assert unpickled[text1] == store1
 assert unpickled[text2] == store2
 assert len(stringstore) == len(unpickled)

@@ -29,8 +29,8 @@ def test_pickle_vocab(text1, text2):
 lex2 = vocab[text2]
 assert lex1.norm_ == text1[:-1]
 assert lex2.norm_ == text2[:-1]
-data = pickle.dumps(vocab)
-unpickled = pickle.loads(data)
+data = srsly.pickle_dumps(vocab)
+unpickled = srsly.pickle_loads(data)
 assert unpickled[text1].orth == lex1.orth
 assert unpickled[text2].orth == lex2.orth
 assert unpickled[text1].norm == lex1.norm
@@ -5,7 +5,7 @@ import numpy
 import tempfile
 import shutil
 import contextlib
-import msgpack
+import srsly
 from pathlib import Path
 from spacy.tokens import Doc, Span
 from spacy.attrs import POS, HEAD, DEP

@@ -100,8 +100,8 @@ def assert_docs_equal(doc1, doc2):

 def assert_packed_msg_equal(b1, b2):
 """Assert that two packed msgpack messages are equal."""
-msg1 = msgpack.loads(b1, encoding="utf8")
-msg2 = msgpack.loads(b2, encoding="utf8")
+msg1 = srsly.msgpack_loads(b1)
+msg2 = srsly.msgpack_loads(b2)
 assert sorted(msg1.keys()) == sorted(msg2.keys())
 for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
 assert k1 == k2
@@ -1,8 +1,8 @@
 from __future__ import unicode_literals

 import numpy
-import msgpack
 import gzip
+import srsly
 from thinc.neural.ops import NumpyOps

 from ..compat import copy_reg

@@ -74,11 +74,11 @@ class Binder(object):
 "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
 "strings": list(self.strings),
 }
-return gzip.compress(msgpack.dumps(msg))
+return gzip.compress(srsly.msgpack_dumps(msg))

 def from_bytes(self, string):
 """Deserialize the binder's annotations from a byte string."""
-msg = msgpack.loads(gzip.decompress(string))
+msg = srsly.msgpack_loads(gzip.decompress(string))
 self.attrs = msg["attrs"]
 self.strings = set(msg["strings"])
 lengths = numpy.fromstring(msg["lengths"], dtype="int32")
@@ -9,9 +9,9 @@ cimport numpy as np
 import numpy
 import numpy.linalg
 import struct
-import dill
+import srsly
-import msgpack
 from thinc.neural.util import get_array_module, copy_array
+import srsly

 from libc.string cimport memcpy, memset
 from libc.math cimport sqrt

@@ -28,7 +28,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
 from ..attrs cimport ENT_TYPE, SENT_START
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
-from ..util import normalize_slice, is_json_serializable
+from ..util import normalize_slice
 from ..compat import is_config, copy_reg, pickle, basestring_
 from ..errors import deprecation_warning, models_warning, user_warning
 from ..errors import Errors, Warnings

@@ -807,8 +807,8 @@ cdef class Doc:
 }
 if 'user_data' not in exclude and self.user_data:
 user_data_keys, user_data_values = list(zip(*self.user_data.items()))
-serializers['user_data_keys'] = lambda: msgpack.dumps(user_data_keys)
-serializers['user_data_values'] = lambda: msgpack.dumps(user_data_values)
+serializers['user_data_keys'] = lambda: srsly.msgpack_dumps(user_data_keys)
+serializers['user_data_values'] = lambda: srsly.msgpack_dumps(user_data_values)

 return util.to_bytes(serializers, exclude)

@@ -836,9 +836,8 @@ cdef class Doc:
 # keys, we must have tuples. In values we just have to hope
 # users don't mind getting a list instead of a tuple.
 if 'user_data' not in exclude and 'user_data_keys' in msg:
-user_data_keys = msgpack.loads(msg['user_data_keys'],
-    use_list=False, raw=False)
-user_data_values = msgpack.loads(msg['user_data_values'], raw=False)
+user_data_keys = srsly.msgpack_loads(msg['user_data_keys'], use_list=False)
+user_data_values = srsly.msgpack_loads(msg['user_data_values'])
 for key, value in zip(user_data_keys, user_data_values):
 self.user_data[key] = value

@@ -996,7 +995,7 @@ cdef class Doc:
 if not self.has_extension(attr):
 raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
 value = self._.get(attr)
-if not is_json_serializable(value):
+if not srsly.is_json_serializable(value):
 raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
 data['_'][attr] = value
 return data

@@ -1062,11 +1061,11 @@ def pickle_doc(doc):
 bytes_data = doc.to_bytes(vocab=False, user_data=False)
 hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,
 doc.user_token_hooks)
-return (unpickle_doc, (doc.vocab, dill.dumps(hooks_and_data), bytes_data))
+return (unpickle_doc, (doc.vocab, srsly.pickle_dumps(hooks_and_data), bytes_data))


 def unpickle_doc(vocab, hooks_and_data, bytes_data):
-user_data, doc_hooks, span_hooks, token_hooks = dill.loads(hooks_and_data)
+user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data)

 doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data,
 exclude='user_data')
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals, print_function

 import os
-import ujson
 import pkg_resources
 import importlib
 import regex as re

@@ -12,21 +11,15 @@ from collections import OrderedDict
 from thinc.neural._classes.model import Model
 from thinc.neural.ops import NumpyOps
 import functools
-import cytoolz
 import itertools
 import numpy.random
+import srsly

 from .symbols import ORTH
 from .compat import cupy, CudaStream, path2str, basestring_, unicode_
-from .compat import import_file, json_dumps
+from .compat import import_file
 from .errors import Errors

-# Import these directly from Thinc, so that we're sure we always have the
-# same version.
-from thinc.neural._classes.model import msgpack  # noqa: F401
-from thinc.neural._classes.model import msgpack_numpy  # noqa: F401

 LANGUAGES = {}
 _data_path = Path(__file__).parent / "data"

@@ -185,7 +178,7 @@ def get_model_meta(path):
 meta_path = model_path / "meta.json"
 if not meta_path.is_file():
 raise IOError(Errors.E053.format(path=meta_path))
-meta = read_json(meta_path)
+meta = srsly.read_json(meta_path)
 for setting in ["lang", "name", "version"]:
 if setting not in meta or not meta[setting]:
 raise ValueError(Errors.E054.format(setting=setting))

@@ -409,7 +402,7 @@ def minibatch(items, size=8):
 items = iter(items)
 while True:
 batch_size = next(size_)
-batch = list(cytoolz.take(int(batch_size), items))
+batch = list(itertools.islice(items, int(batch_size)))
 if len(batch) == 0:
 break
 yield list(batch)

@@ -529,74 +522,16 @@ def itershuffle(iterable, bufsize=1000):
 raise StopIteration


-def read_json(location):
-    """Open and load JSON from file.
-
-    location (Path): Path to JSON file.
-    RETURNS (dict): Loaded JSON content.
-    """
-    location = ensure_path(location)
-    with location.open("r", encoding="utf8") as f:
-        return ujson.load(f)
-
-
-def write_json(file_path, contents):
-    """Create a .json file and dump contents.
-
-    file_path (unicode / Path): The path to the output file.
-    contents: The JSON-serializable contents to output.
-    """
-    with Path(file_path).open("w", encoding="utf8") as f:
-        f.write(json_dumps(contents))
-
-
-def read_jsonl(file_path):
-    """Read a .jsonl file and yield its contents line by line.
-
-    file_path (unicode / Path): The file path.
-    YIELDS: The loaded JSON contents of each line.
-    """
-    with Path(file_path).open("r", encoding="utf8") as f:
-        for line in f:
-            try:  # hack to handle broken jsonl
-                yield ujson.loads(line.strip())
-            except ValueError:
-                continue
-
-
-def write_jsonl(file_path, lines):
-    """Create a .jsonl file and dump contents.
-
-    file_path (unicode / Path): The path to the output file.
-    lines (list): The JSON-serializable contents of each line.
-    """
-    data = [json_dumps(line) for line in lines]
-    with Path(file_path).open("w", encoding="utf-8") as f:
-        f.write("\n".join(data))
-
-
-def is_json_serializable(obj):
-    """Check if a Python object is JSON-serializable."""
-    if hasattr(obj, "__call__"):
-        # Check this separately here to prevent infinite recursions
-        return False
-    try:
-        ujson.dumps(obj)
-        return True
-    except TypeError:
-        return False
-
-
 def to_bytes(getters, exclude):
 serialized = OrderedDict()
 for key, getter in getters.items():
 if key not in exclude:
 serialized[key] = getter()
-return msgpack.dumps(serialized, use_bin_type=True)
+return srsly.msgpack_dumps(serialized)


 def from_bytes(bytes_data, setters, exclude):
-msg = msgpack.loads(bytes_data, raw=False)
+msg = srsly.msgpack_loads(bytes_data)
 for key, setter in setters.items():
 if key not in exclude and key in msg:
 setter(msg[key])
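Note: with the local `read_json`, `write_json`, `read_jsonl`, `write_jsonl` and `is_json_serializable` helpers deleted from `util` above, callers switch to the same-named functions in `srsly`, as the hunks throughout this diff do. A brief sketch of the replacement API; the `/tmp` paths are only illustrative:

```python
from pathlib import Path
import srsly

meta = {"lang": "en", "name": "demo", "version": "0.0.0"}
meta_path = Path("/tmp/meta.json")           # hypothetical output location
srsly.write_json(meta_path, meta)            # replaces util.write_json
assert srsly.read_json(meta_path) == meta    # replaces util.read_json

patterns = [{"label": "ORG", "pattern": "spaCy"}]
srsly.write_jsonl("/tmp/patterns.jsonl", patterns)               # replaces util.write_jsonl
assert list(srsly.read_jsonl("/tmp/patterns.jsonl")) == patterns  # replaces util.read_jsonl

# replaces util.is_json_serializable
assert srsly.is_json_serializable({"ok": True})
assert not srsly.is_json_serializable(lambda value: value)
```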
@@ -4,9 +4,7 @@ from __future__ import unicode_literals
 import functools
 import numpy
 from collections import OrderedDict
+import srsly
-from .util import msgpack
-from .util import msgpack_numpy

 cimport numpy as np
 from thinc.neural.util import get_array_module

@@ -353,7 +351,7 @@ cdef class Vectors:
 save_array = lambda arr, file_: xp.save(file_, arr)
 serializers = OrderedDict((
 ('vectors', lambda p: save_array(self.data, p.open('wb'))),
-('key2row', lambda p: msgpack.dump(self.key2row, p.open('wb')))
+('key2row', lambda p: srsly.write_msgpack(p, self.key2row))
 ))
 return util.to_disk(path, serializers, exclude)

@@ -366,8 +364,7 @@ cdef class Vectors:
 """
 def load_key2row(path):
 if path.exists():
-with path.open('rb') as file_:
-    self.key2row = msgpack.load(file_)
+self.key2row = srsly.read_msgpack(path)
 for key, row in self.key2row.items():
 if self._unset.count(row):
 self._unset.erase(self._unset.find(row))

@@ -401,9 +398,9 @@ cdef class Vectors:
 if hasattr(self.data, 'to_bytes'):
 return self.data.to_bytes()
 else:
-return msgpack.dumps(self.data)
+return srsly.msgpack_dumps(self.data)
 serializers = OrderedDict((
-('key2row', lambda: msgpack.dumps(self.key2row)),
+('key2row', lambda: srsly.msgpack_dumps(self.key2row)),
 ('vectors', serialize_weights)
 ))
 return util.to_bytes(serializers, exclude)

@@ -419,10 +416,10 @@ cdef class Vectors:
 if hasattr(self.data, 'from_bytes'):
 self.data.from_bytes()
 else:
-self.data = msgpack.loads(b)
+self.data = srsly.msgpack_loads(b)

 deserializers = OrderedDict((
-('key2row', lambda b: self.key2row.update(msgpack.loads(b))),
+('key2row', lambda b: self.key2row.update(srsly.msgpack_loads(b))),
 ('vectors', deserialize_weights)
 ))
 util.from_bytes(data, deserializers, exclude)
@@ -3,7 +3,7 @@
 from __future__ import unicode_literals

 import numpy
-import dill
+import srsly

 from collections import OrderedDict
 from thinc.neural.util import get_array_module

@@ -513,7 +513,7 @@ def pickle_vocab(vocab):
 morph = vocab.morphology
 length = vocab.length
 data_dir = vocab.data_dir
-lex_attr_getters = dill.dumps(vocab.lex_attr_getters)
+lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
 lexemes_data = vocab.lexemes_to_bytes()
 return (unpickle_vocab,
 (sstore, vectors, morph, data_dir, lex_attr_getters, lexemes_data, length))

@@ -527,7 +527,7 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir,
 vocab.strings = sstore
 vocab.morphology = morphology
 vocab.data_dir = data_dir
-vocab.lex_attr_getters = dill.loads(lex_attr_getters)
+vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
 vocab.lexemes_from_bytes(lexemes_data)
 vocab.length = length
 return vocab
@@ -9,10 +9,9 @@ p
 | underscore, e.e #[code unicode_].

 +aside-code("Example").
-    from spacy.compat import unicode_, json_dumps
+    from spacy.compat import unicode_

     compatible_unicode = unicode_('hello world')
-    compatible_json = json_dumps({'key': 'value'})

 +table(["Name", "Python 2", "Python 3"])
 +row

@@ -35,11 +34,6 @@ p
 +cell #[code raw_input]
 +cell #[code input]

-+row
-    +cell #[code compat.json_dumps]
-    +cell #[code ujson.dumps] with #[code .decode('utf8')]
-    +cell #[code ujson.dumps]

 +row
 +cell #[code compat.path2str]
 +cell #[code str(path)] with #[code .decode('utf8')]