mirror of
synced 2025-03-28 22:04:13 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
@ -292,10 +292,9 @@ for example to show more specific error messages, you can use the `is_config()`
helper function.
from .compat import unicode_, json_dumps, is_config
from .compat import unicode_, is_config
compatible_unicode = unicode_('hello world')
compatible_json = json_dumps({'key': 'value'})
if is_config(windows=True, python2=True):
print("You are using Python 2 on Windows.")
@ -5,7 +5,7 @@ dist/spacy.pex : spacy/*.py* spacy/*/*.py*
python3.6 -m venv env3.6
source env3.6/bin/activate
env3.6/bin/pip install wheel
env3.6/bin/pip install -r requirements.txt
env3.6/bin/pip install -r requirements.txt --no-cache-dir --no-binary :all:
env3.6/bin/python setup.py build_ext --inplace
env3.6/bin/python setup.py sdist
env3.6/bin/python setup.py bdist_wheel
@ -1,3 +1,5 @@
<a href="https://explosion.ai"><img src="https://explosion.ai/assets/img/logo.svg" width="125" height="125" align="right" /></a>
# spaCy: Industrial-strength NLP
spaCy is a library for advanced Natural Language Processing in Python and
@ -3,7 +3,7 @@ from __future__ import unicode_literals
import bz2
import regex as re
import ujson
import srsly
import sys
import random
import datetime
@ -44,7 +44,7 @@ class Reddit(object):
line = line.strip()
if not line:
comment = ujson.loads(line)
comment = srsly.json_loads(line)
if self.is_valid(comment):
text = self.strip_tags(comment["body"])
yield {"text": text}
@ -75,7 +75,7 @@ class Reddit(object):
def main(path):
reddit = Reddit(path)
for comment in reddit:
if __name__ == "__main__":
@ -1,5 +1,12 @@
This example shows how to use an LSTM sentiment classification model trained using Keras in spaCy. spaCy splits the document into sentences, and each sentence is classified using the LSTM. The scores for the sentences are then aggregated to give the document score. This kind of hierarchical model is quite difficult to build in "pure" Keras or TensorFlow, but it's very effective. The Keras example on this dataset performs quite poorly, because it cuts off the documents so that they're a fixed size. This hurts review accuracy a lot, because people often summarise their rating in the final sentence.
This example shows how to use an LSTM sentiment classification model trained
using Keras in spaCy. spaCy splits the document into sentences, and each
sentence is classified using the LSTM. The scores for the sentences are then
aggregated to give the document score. This kind of hierarchical model is quite
difficult to build in "pure" Keras or TensorFlow, but it's very effective. The
Keras example on this dataset performs quite poorly, because it cuts off the
documents so that they're a fixed size. This hurts review accuracy a lot,
because people often summarise their rating in the final sentence.
spacy download en_vectors_web_lg
@ -25,9 +32,9 @@ import spacy
class SentimentAnalyser(object):
def load(cls, path, nlp, max_length=100):
with (path / 'config.json').open() as file_:
with (path / "config.json").open() as file_:
model = model_from_json(file_.read())
with (path / 'model').open('rb') as file_:
with (path / "model").open("rb") as file_:
lstm_weights = pickle.load(file_)
embeddings = get_embeddings(nlp.vocab)
model.set_weights([embeddings] + lstm_weights)
@ -69,12 +76,12 @@ def get_labelled_sentences(docs, doc_labels):
for sent in doc.sents:
return sentences, numpy.asarray(labels, dtype='int32')
return sentences, numpy.asarray(labels, dtype="int32")
def get_features(docs, max_length):
docs = list(docs)
Xs = numpy.zeros((len(docs), max_length), dtype='int32')
Xs = numpy.zeros((len(docs), max_length), dtype="int32")
for i, doc in enumerate(docs):
j = 0
for token in doc:
@ -89,16 +96,25 @@ def get_features(docs, max_length):
return Xs
def train(train_texts, train_labels, dev_texts, dev_labels,
lstm_shape, lstm_settings, lstm_optimizer, batch_size=100,
nb_epoch=5, by_sentence=True):
def train(
print("Loading spaCy")
nlp = spacy.load('en_vectors_web_lg')
nlp = spacy.load("en_vectors_web_lg")
embeddings = get_embeddings(nlp.vocab)
model = compile_lstm(embeddings, lstm_shape, lstm_settings)
print("Parsing texts...")
train_docs = list(nlp.pipe(train_texts))
dev_docs = list(nlp.pipe(dev_texts))
@ -106,10 +122,15 @@ def train(train_texts, train_labels, dev_texts, dev_labels,
train_docs, train_labels = get_labelled_sentences(train_docs, train_labels)
dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels)
train_X = get_features(train_docs, lstm_shape['max_length'])
dev_X = get_features(dev_docs, lstm_shape['max_length'])
model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels),
epochs=nb_epoch, batch_size=batch_size)
train_X = get_features(train_docs, lstm_shape["max_length"])
dev_X = get_features(dev_docs, lstm_shape["max_length"])
validation_data=(dev_X, dev_labels),
return model
@ -119,19 +140,28 @@ def compile_lstm(embeddings, shape, settings):
model.add(TimeDistributed(Dense(shape['nr_hidden'], use_bias=False)))
model.add(Dense(shape['nr_class'], activation='sigmoid'))
model.compile(optimizer=Adam(lr=settings['lr']), loss='binary_crossentropy',
model.add(TimeDistributed(Dense(shape["nr_hidden"], use_bias=False)))
model.add(Dense(shape["nr_class"], activation="sigmoid"))
return model
@ -140,8 +170,8 @@ def get_embeddings(vocab):
def evaluate(model_dir, texts, labels, max_length=100):
nlp = spacy.load('en_vectors_web_lg')
nlp = spacy.load("en_vectors_web_lg")
nlp.add_pipe(SentimentAnalyser.load(model_dir, nlp, max_length=max_length))
correct = 0
@ -154,7 +184,7 @@ def evaluate(model_dir, texts, labels, max_length=100):
def read_data(data_dir, limit=0):
examples = []
for subdir, label in (('pos', 1), ('neg', 0)):
for subdir, label in (("pos", 1), ("neg", 0)):
for filename in (data_dir / subdir).iterdir():
with filename.open() as file_:
text = file_.read()
@ -162,7 +192,7 @@ def read_data(data_dir, limit=0):
if limit >= 1:
examples = examples[:limit]
return zip(*examples) # Unzips into two lists
return zip(*examples) # Unzips into two lists
@ -176,13 +206,21 @@ def read_data(data_dir, limit=0):
learn_rate=("Learn rate", "option", "e", float),
nb_epoch=("Number of training epochs", "option", "i", int),
batch_size=("Size of minibatches for training LSTM", "option", "b", int),
nr_examples=("Limit to N examples", "option", "n", int)
nr_examples=("Limit to N examples", "option", "n", int),
def main(model_dir=None, train_dir=None, dev_dir=None,
nr_hidden=64, max_length=100, # Shape
dropout=0.5, learn_rate=0.001, # General NN config
nb_epoch=5, batch_size=256, nr_examples=-1): # Training params
def main(
max_length=100, # Shape
learn_rate=0.001, # General NN config
): # Training params
if model_dir is not None:
model_dir = pathlib.Path(model_dir)
if train_dir is None or dev_dir is None:
@ -204,20 +242,26 @@ def main(model_dir=None, train_dir=None, dev_dir=None,
dev_texts, dev_labels = zip(*imdb_data[1])
dev_texts, dev_labels = read_data(dev_dir, imdb_data, limit=nr_examples)
train_labels = numpy.asarray(train_labels, dtype='int32')
dev_labels = numpy.asarray(dev_labels, dtype='int32')
lstm = train(train_texts, train_labels, dev_texts, dev_labels,
{'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 1},
{'dropout': dropout, 'lr': learn_rate},
nb_epoch=nb_epoch, batch_size=batch_size)
train_labels = numpy.asarray(train_labels, dtype="int32")
dev_labels = numpy.asarray(dev_labels, dtype="int32")
lstm = train(
{"nr_hidden": nr_hidden, "max_length": max_length, "nr_class": 1},
{"dropout": dropout, "lr": learn_rate},
weights = lstm.get_weights()
if model_dir is not None:
with (model_dir / 'model').open('wb') as file_:
with (model_dir / "model").open("wb") as file_:
pickle.dump(weights[1:], file_)
with (model_dir / 'config.json').open('w') as file_:
with (model_dir / "config.json").open("w") as file_:
if __name__ == '__main__':
if __name__ == "__main__":
@ -15,14 +15,15 @@ import spacy
'Net income was $9.4 million compared to the prior year of $2.7 million.',
'Revenue exceeded twelve billion dollars, with a loss of $1b.',
"Net income was $9.4 million compared to the prior year of $2.7 million.",
"Revenue exceeded twelve billion dollars, with a loss of $1b.",
model=("Model to load (needs parser and NER)", "positional", None, str))
def main(model='en_core_web_sm'):
model=("Model to load (needs parser and NER)", "positional", None, str)
def main(model="en_core_web_sm"):
nlp = spacy.load(model)
print("Loaded model '%s'" % model)
print("Processing %d texts" % len(TEXTS))
@ -31,7 +32,7 @@ def main(model='en_core_web_sm'):
doc = nlp(text)
relations = extract_currency_relations(doc)
for r1, r2 in relations:
print('{:<10}\t{}\t{}'.format(r1.text, r2.ent_type_, r2.text))
print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text))
def extract_currency_relations(doc):
@ -41,18 +42,18 @@ def extract_currency_relations(doc):
relations = []
for money in filter(lambda w: w.ent_type_ == 'MONEY', doc):
if money.dep_ in ('attr', 'dobj'):
subject = [w for w in money.head.lefts if w.dep_ == 'nsubj']
for money in filter(lambda w: w.ent_type_ == "MONEY", doc):
if money.dep_ in ("attr", "dobj"):
subject = [w for w in money.head.lefts if w.dep_ == "nsubj"]
if subject:
subject = subject[0]
relations.append((subject, money))
elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
elif money.dep_ == "pobj" and money.head.dep_ == "prep":
relations.append((money.head.head, money))
return relations
if __name__ == '__main__':
if __name__ == "__main__":
# Expected output:
@ -24,37 +24,39 @@ import plac
import spacy
model=("Model to load", "positional", None, str))
def main(model='en_core_web_sm'):
@plac.annotations(model=("Model to load", "positional", None, str))
def main(model="en_core_web_sm"):
nlp = spacy.load(model)
print("Loaded model '%s'" % model)
doc = nlp("displaCy uses CSS and JavaScript to show you how computers "
"understand language")
doc = nlp(
"displaCy uses CSS and JavaScript to show you how computers "
"understand language"
# The easiest way is to find the head of the subtree you want, and then use
# the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree`
# is the one that does what you're asking for most directly:
for word in doc:
if word.dep_ in ('xcomp', 'ccomp'):
print(''.join(w.text_with_ws for w in word.subtree))
if word.dep_ in ("xcomp", "ccomp"):
print("".join(w.text_with_ws for w in word.subtree))
# It'd probably be better for `word.subtree` to return a `Span` object
# instead of a generator over the tokens. If you want the `Span` you can
# get it via the `.right_edge` and `.left_edge` properties. The `Span`
# object is nice because you can easily get a vector, merge it, etc.
for word in doc:
if word.dep_ in ('xcomp', 'ccomp'):
if word.dep_ in ("xcomp", "ccomp"):
subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
print(subtree_span.text, '|', subtree_span.root.text)
print(subtree_span.text, "|", subtree_span.root.text)
# You might also want to select a head, and then select a start and end
# position by walking along its children. You could then take the
# `.left_edge` and `.right_edge` of those tokens, and use it to calculate
# a span.
if __name__ == '__main__':
if __name__ == "__main__":
# Expected output:
@ -45,7 +45,7 @@ from __future__ import print_function, unicode_literals, division
from bz2 import BZ2File
import time
import plac
import ujson
import json
from spacy.matcher import PhraseMatcher
import spacy
@ -55,15 +55,15 @@ import spacy
patterns_loc=("Path to gazetteer", "positional", None, str),
text_loc=("Path to Reddit corpus file", "positional", None, str),
n=("Number of texts to read", "option", "n", int),
lang=("Language class to initialise", "option", "l", str))
def main(patterns_loc, text_loc, n=10000, lang='en'):
nlp = spacy.blank('en')
lang=("Language class to initialise", "option", "l", str),
def main(patterns_loc, text_loc, n=10000, lang="en"):
nlp = spacy.blank("en")
nlp.vocab.lex_attr_getters = {}
phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
count = 0
t1 = time.time()
for ent_id, text in get_matches(nlp.tokenizer, phrases,
read_text(text_loc, n=n)):
for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
count += 1
t2 = time.time()
print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
@ -71,8 +71,8 @@ def main(patterns_loc, text_loc, n=10000, lang='en'):
def read_gazetteer(tokenizer, loc, n=-1):
for i, line in enumerate(open(loc)):
data = ujson.loads(line.strip())
phrase = tokenizer(data['text'])
data = json.loads(line.strip())
phrase = tokenizer(data["text"])
for w in phrase:
_ = tokenizer.vocab[w.text]
if len(phrase) >= 2:
@ -82,15 +82,15 @@ def read_gazetteer(tokenizer, loc, n=-1):
def read_text(bz2_loc, n=10000):
with BZ2File(bz2_loc) as file_:
for i, line in enumerate(file_):
data = ujson.loads(line)
yield data['body']
data = json.loads(line)
yield data["body"]
if i >= n:
def get_matches(tokenizer, phrases, texts, max_length=6):
matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
matcher.add('Phrase', None, *phrases)
matcher.add("Phrase", None, *phrases)
for text in texts:
doc = tokenizer(text)
for w in doc:
@ -100,10 +100,11 @@ def get_matches(tokenizer, phrases, texts, max_length=6):
yield (ent_id, doc[start:end].text)
if __name__ == '__main__':
if __name__ == "__main__":
if False:
import cProfile
import pstats
cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
@ -1,5 +1,5 @@
import numpy as np
import ujson as json
import json
from keras.utils import to_categorical
import plac
import sys
@ -32,7 +32,7 @@ def set_keras_backend(backend):
def train(train_loc, dev_loc, shape, settings):
@ -42,7 +42,7 @@ def train(train_loc, dev_loc, shape, settings):
print("Loading spaCy")
nlp = spacy.load('en_vectors_web_lg')
assert nlp.path is not None
print("Processing texts...")
train_X = create_dataset(nlp, train_texts1, train_texts2, 100, shape[0])
dev_X = create_dataset(nlp, dev_texts1, dev_texts2, 100, shape[0])
@ -57,7 +57,7 @@ def train(train_loc, dev_loc, shape, settings):
validation_data = (dev_X, dev_labels),
epochs = settings['nr_epoch'],
batch_size = settings['batch_size'])
if not (nlp.path / 'similarity').exists():
(nlp.path / 'similarity').mkdir()
print("Saving to", nlp.path / 'similarity')
@ -74,7 +74,7 @@ def evaluate(dev_loc, shape):
dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
nlp = spacy.load('en_vectors_web_lg')
nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0]))
total = 0.
correct = 0.
for text1, text2, label in zip(dev_texts1, dev_texts2, dev_labels):
@ -119,33 +119,33 @@ def read_snli(path):
def create_dataset(nlp, texts, hypotheses, num_unk, max_length):
sents = texts + hypotheses
sents_as_ids = []
for sent in sents:
doc = nlp(sent)
word_ids = []
for i, token in enumerate(doc):
# skip odd spaces from tokenizer
if token.has_vector and token.vector_norm == 0:
if i > max_length:
if token.has_vector:
word_ids.append(token.rank + num_unk + 1)
# if we don't have a vector, pick an OOV entry
word_ids.append(token.rank % num_unk + 1)
word_ids.append(token.rank % num_unk + 1)
# there must be a simpler way of generating padded arrays from lists...
word_id_vec = np.zeros((max_length), dtype='int')
clipped_len = min(max_length, len(word_ids))
word_id_vec[:clipped_len] = word_ids[:clipped_len]
return [np.array(sents_as_ids[:len(texts)]), np.array(sents_as_ids[len(texts):])]
@ -169,7 +169,7 @@ def main(mode, train_loc, dev_loc,
batch_size = 1024,
nr_epoch = 10,
shape = (max_length, nr_hidden, 3)
settings = {
'lr': learn_rate,
@ -10,19 +10,19 @@ def build_model(vectors, shape, settings):
input1 = layers.Input(shape=(max_length,), dtype='int32', name='words1')
input2 = layers.Input(shape=(max_length,), dtype='int32', name='words2')
# embeddings (projected)
embed = create_embedding(vectors, max_length, nr_hidden)
a = embed(input1)
b = embed(input2)
# step 1: attend
F = create_feedforward(nr_hidden)
att_weights = layers.dot([F(a), F(b)], axes=-1)
G = create_feedforward(nr_hidden)
if settings['entail_dir'] == 'both':
norm_weights_a = layers.Lambda(normalizer(1))(att_weights)
norm_weights_b = layers.Lambda(normalizer(2))(att_weights)
@ -55,18 +55,18 @@ def build_model(vectors, shape, settings):
v1 = layers.TimeDistributed(G)(comp1)
v1_sum = layers.Lambda(sum_word)(v1)
concat = v1_sum
H = create_feedforward(nr_hidden)
out = H(concat)
out = layers.Dense(nr_class, activation='softmax')(out)
model = Model([input1, input2], out)
return model
@ -78,7 +78,7 @@ def create_embedding(vectors, max_length, projected_dim):
@ -77,7 +77,7 @@
"source": [
"import ujson as json\n",
"import json\n",
"from keras.utils import to_categorical\n",
"LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n",
@ -19,39 +19,40 @@ from pathlib import Path
output_dir=("Output directory for saved HTML", "positional", None, Path))
output_dir=("Output directory for saved HTML", "positional", None, Path)
def main(output_dir=None):
nlp = English() # start off with blank English class
Doc.set_extension('overlap', method=overlap_tokens)
doc1 = nlp(u"Peach emoji is where it has always been.")
doc2 = nlp(u"Peach is the superior emoji.")
Doc.set_extension("overlap", method=overlap_tokens)
doc1 = nlp("Peach emoji is where it has always been.")
doc2 = nlp("Peach is the superior emoji.")
print("Text 1:", doc1.text)
print("Text 2:", doc2.text)
print("Overlapping tokens:", doc1._.overlap(doc2))
Doc.set_extension('to_html', method=to_html)
doc = nlp(u"This is a sentence about Apple.")
Doc.set_extension("to_html", method=to_html)
doc = nlp("This is a sentence about Apple.")
# add entity manually for demo purposes, to make it work without a model
doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings["ORG"])]
print("Text:", doc.text)
doc._.to_html(output=output_dir, style='ent')
doc._.to_html(output=output_dir, style="ent")
def to_html(doc, output='/tmp', style='dep'):
def to_html(doc, output="/tmp", style="dep"):
"""Doc method extension for saving the current state as a displaCy
# generate filename from first six non-punct tokens
file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
file_name = "-".join([w.text for w in doc[:6] if not w.is_punct]) + ".html"
html = displacy.render(doc, style=style, page=True) # render markup
if output is not None:
output_path = Path(output)
if not output_path.exists():
output_file = Path(output) / file_name
output_file.open('w', encoding='utf-8').write(html) # save to file
print('Saved HTML to {}'.format(output_file))
output_file.open("w", encoding="utf-8").write(html) # save to file
print("Saved HTML to {}".format(output_file))
@ -67,7 +68,7 @@ def overlap_tokens(doc, other_doc):
return overlap
if __name__ == '__main__':
if __name__ == "__main__":
# Expected output:
@ -25,15 +25,19 @@ def main():
# and no model or pre-defined pipeline loaded.
nlp = English()
rest_countries = RESTCountriesComponent(nlp) # initialise component
nlp.add_pipe(rest_countries) # add it to the pipeline
doc = nlp(u"Some text about Colombia and the Czech Republic")
print('Pipeline', nlp.pipe_names) # pipeline contains component name
print('Doc has countries', doc._.has_country) # Doc contains countries
nlp.add_pipe(rest_countries) # add it to the pipeline
doc = nlp("Some text about Colombia and the Czech Republic")
print("Pipeline", nlp.pipe_names) # pipeline contains component name
print("Doc has countries", doc._.has_country) # Doc contains countries
for token in doc:
if token._.is_country:
print(token.text, token._.country_capital, token._.country_latlng,
token._.country_flag) # country data
print('Entities', [(e.text, e.label_) for e in doc.ents]) # entities
) # country data
print("Entities", [(e.text, e.label_) for e in doc.ents]) # entities
class RESTCountriesComponent(object):
@ -41,42 +45,42 @@ class RESTCountriesComponent(object):
the REST Countries API, merges country names into one token, assigns entity
labels and sets attributes on country tokens.
name = 'rest_countries' # component name, will show up in the pipeline
def __init__(self, nlp, label='GPE'):
name = "rest_countries" # component name, will show up in the pipeline
def __init__(self, nlp, label="GPE"):
"""Initialise the pipeline component. The shared nlp instance is used
to initialise the matcher with the shared vocab, get the label ID and
generate Doc objects as phrase match patterns.
# Make request once on initialisation and store the data
r = requests.get('https://restcountries.eu/rest/v2/all')
r = requests.get("https://restcountries.eu/rest/v2/all")
r.raise_for_status() # make sure requests raises an error if it fails
countries = r.json()
# Convert API response to dict keyed by country name for easy lookup
# This could also be extended using the alternative and foreign language
# names provided by the API
self.countries = {c['name']: c for c in countries}
self.countries = {c["name"]: c for c in countries}
self.label = nlp.vocab.strings[label] # get entity label ID
# Set up the PhraseMatcher with Doc patterns for each country name
patterns = [nlp(c) for c in self.countries.keys()]
self.matcher = PhraseMatcher(nlp.vocab)
self.matcher.add('COUNTRIES', None, *patterns)
self.matcher.add("COUNTRIES", None, *patterns)
# Register attribute on the Token. We'll be overwriting this based on
# the matches, so we're only setting a default value, not a getter.
# If no default value is set, it defaults to None.
Token.set_extension('is_country', default=False)
Token.set_extension('country_capital', default=False)
Token.set_extension('country_latlng', default=False)
Token.set_extension('country_flag', default=False)
Token.set_extension("is_country", default=False)
Token.set_extension("country_capital", default=False)
Token.set_extension("country_latlng", default=False)
Token.set_extension("country_flag", default=False)
# Register attributes on Doc and Span via a getter that checks if one of
# the contained tokens is set to is_country == True.
Doc.set_extension('has_country', getter=self.has_country)
Span.set_extension('has_country', getter=self.has_country)
Doc.set_extension("has_country", getter=self.has_country)
Span.set_extension("has_country", getter=self.has_country)
def __call__(self, doc):
"""Apply the pipeline component on a Doc object and modify it if matches
@ -93,10 +97,10 @@ class RESTCountriesComponent(object):
# Can be extended with other data returned by the API, like
# currencies, country code, flag, calling code etc.
for token in entity:
token._.set('is_country', True)
token._.set('country_capital', self.countries[entity.text]['capital'])
token._.set('country_latlng', self.countries[entity.text]['latlng'])
token._.set('country_flag', self.countries[entity.text]['flag'])
token._.set("is_country", True)
token._.set("country_capital", self.countries[entity.text]["capital"])
token._.set("country_latlng", self.countries[entity.text]["latlng"])
token._.set("country_flag", self.countries[entity.text]["flag"])
# Overwrite doc.ents and add entity – be careful not to replace!
doc.ents = list(doc.ents) + [entity]
for span in spans:
@ -111,10 +115,10 @@ class RESTCountriesComponent(object):
is a country. Since the getter is only called when we access the
attribute, we can refer to the Token's 'is_country' attribute here,
which is already set in the processing step."""
return any([t._.get('is_country') for t in tokens])
return any([t._.get("is_country") for t in tokens])
if __name__ == '__main__':
if __name__ == "__main__":
# Expected output:
@ -20,23 +20,24 @@ from spacy.tokens import Doc, Span, Token
text=("Text to process", "positional", None, str),
companies=("Names of technology companies", "positional", None, str))
companies=("Names of technology companies", "positional", None, str),
def main(text="Alphabet Inc. is the company behind Google.", *companies):
# For simplicity, we start off with only the blank English Language class
# and no model or pre-defined pipeline loaded.
nlp = English()
if not companies: # set default companies if none are set via args
companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc.
companies = ["Alphabet Inc.", "Google", "Netflix", "Apple"] # etc.
component = TechCompanyRecognizer(nlp, companies) # initialise component
nlp.add_pipe(component, last=True) # add last to the pipeline
doc = nlp(text)
print('Pipeline', nlp.pipe_names) # pipeline contains component name
print('Tokens', [t.text for t in doc]) # company names from the list are merged
print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs
print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org
print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities
print("Pipeline", nlp.pipe_names) # pipeline contains component name
print("Tokens", [t.text for t in doc]) # company names from the list are merged
print("Doc has_tech_org", doc._.has_tech_org) # Doc contains tech orgs
print("Token 0 is_tech_org", doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org
print("Token 1 is_tech_org", doc[1]._.is_tech_org) # "is" is not
print("Entities", [(e.text, e.label_) for e in doc.ents]) # all orgs are entities
class TechCompanyRecognizer(object):
@ -45,9 +46,10 @@ class TechCompanyRecognizer(object):
labelled as ORG and their spans are merged into one token. Additionally,
._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token
name = 'tech_companies' # component name, will show up in the pipeline
def __init__(self, nlp, companies=tuple(), label='ORG'):
name = "tech_companies" # component name, will show up in the pipeline
def __init__(self, nlp, companies=tuple(), label="ORG"):
"""Initialise the pipeline component. The shared nlp instance is used
to initialise the matcher with the shared vocab, get the label ID and
generate Doc objects as phrase match patterns.
@ -58,16 +60,16 @@ class TechCompanyRecognizer(object):
# so even if the list of companies is long, it's very efficient
patterns = [nlp(org) for org in companies]
self.matcher = PhraseMatcher(nlp.vocab)
self.matcher.add('TECH_ORGS', None, *patterns)
self.matcher.add("TECH_ORGS", None, *patterns)
# Register attribute on the Token. We'll be overwriting this based on
# the matches, so we're only setting a default value, not a getter.
Token.set_extension('is_tech_org', default=False)
Token.set_extension("is_tech_org", default=False)
# Register attributes on Doc and Span via a getter that checks if one of
# the contained tokens is set to is_tech_org == True.
Doc.set_extension('has_tech_org', getter=self.has_tech_org)
Span.set_extension('has_tech_org', getter=self.has_tech_org)
Doc.set_extension("has_tech_org", getter=self.has_tech_org)
Span.set_extension("has_tech_org", getter=self.has_tech_org)
def __call__(self, doc):
"""Apply the pipeline component on a Doc object and modify it if matches
@ -82,7 +84,7 @@ class TechCompanyRecognizer(object):
# Set custom attribute on each token of the entity
for token in entity:
token._.set('is_tech_org', True)
token._.set("is_tech_org", True)
# Overwrite doc.ents and add entity – be careful not to replace!
doc.ents = list(doc.ents) + [entity]
for span in spans:
@ -97,10 +99,10 @@ class TechCompanyRecognizer(object):
is a tech org. Since the getter is only called when we access the
attribute, we can refer to the Token's 'is_tech_org' attribute here,
which is already set in the processing step."""
return any([t._.get('is_tech_org') for t in tokens])
return any([t._.get("is_tech_org") for t in tokens])
if __name__ == '__main__':
if __name__ == "__main__":
# Expected output:
@ -1,4 +1,4 @@
'''Example of adding a pipeline component to prohibit sentence boundaries
"""Example of adding a pipeline component to prohibit sentence boundaries
before certain tokens.
What we do is write to the token.is_sent_start attribute, which
@ -10,16 +10,18 @@ should also improve the parse quality.
The specific example here is drawn from https://github.com/explosion/spaCy/issues/2627
Other versions of the model may not make the original mistake, so the specific
example might not be apt for future versions.
import plac
import spacy
def prevent_sentence_boundaries(doc):
    """Pipeline component that vetoes sentence starts on disallowed tokens.

    Every token for which `can_be_sentence_start` returns a falsy value gets
    `is_sent_start = False` written to it; all other tokens are left
    untouched.

    doc: the document to process; it is modified in place.
    RETURNS: the same `doc` object, so the function can be used as a
        pipeline component.
    """
    # Lazy filter: each token is checked right before it is updated, so the
    # check/assign order per token is the same as in a plain loop.
    blocked = (tok for tok in doc if not can_be_sentence_start(tok))
    for tok in blocked:
        tok.is_sent_start = False
    return doc
def can_be_sentence_start(token):
if token.i == 0:
return True
@ -32,17 +34,18 @@ def can_be_sentence_start(token):
return False
def main():
nlp = spacy.load('en_core_web_lg')
nlp = spacy.load("en_core_web_lg")
raw_text = "Been here and I'm loving it."
doc = nlp(raw_text)
sentences = [sent.string.strip() for sent in doc.sents]
nlp.add_pipe(prevent_sentence_boundaries, before='parser')
nlp.add_pipe(prevent_sentence_boundaries, before="parser")
doc = nlp(raw_text)
sentences = [sent.string.strip() for sent in doc.sents]
if __name__ == '__main__':
if __name__ == "__main__":
@ -1,10 +1,11 @@
'''Demonstrate adding a rule-based component that forces some tokens to not
"""Demonstrate adding a rule-based component that forces some tokens to not
be entities, before the NER tagger is applied. This is used to hotfix the issue
in https://github.com/explosion/spaCy/issues/2870 , present as of spaCy v2.0.16.
import spacy
from spacy.attrs import ENT_IOB
def fix_space_tags(doc):
ent_iobs = doc.to_array([ENT_IOB])
for i, token in enumerate(doc):
@ -14,14 +15,16 @@ def fix_space_tags(doc):
doc.from_array([ENT_IOB], ent_iobs.reshape((len(doc), 1)))
return doc
def main():
nlp = spacy.load('en_core_web_sm')
text = u'''This is some crazy test where I dont need an Apple Watch to make things bug'''
doc = nlp(text)
print('Before', doc.ents)
nlp.add_pipe(fix_space_tags, name='fix-ner', before='ner')
doc = nlp(text)
print('After', doc.ents)
if __name__ == '__main__':
def main():
    """Print the entities of a sample text before and after registering
    `fix_space_tags` in the pipeline ahead of the "ner" component."""
    pipeline = spacy.load("en_core_web_sm")
    sample = u"""This is some crazy test where I dont need an Apple Watch to make things bug"""
    # First pass: the pipeline exactly as loaded.
    print("Before", pipeline(sample).ents)
    # Second pass: same text, with the fix inserted before "ner".
    pipeline.add_pipe(fix_space_tags, name="fix-ner", before="ner")
    print("After", pipeline(sample).ents)
if __name__ == "__main__":
@ -9,6 +9,7 @@ built-in dataset loader.
Compatible with: spaCy v2.0.0+
from __future__ import print_function, unicode_literals
from toolz import partition_all
from pathlib import Path
from joblib import Parallel, delayed
@ -22,9 +23,9 @@ import spacy
model=("Model name (needs tagger)", "positional", None, str),
n_jobs=("Number of workers", "option", "n", int),
batch_size=("Batch-size for each process", "option", "b", int),
limit=("Limit of entries from the dataset", "option", "l", int))
def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000,
limit=("Limit of entries from the dataset", "option", "l", int),
def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10000):
nlp = spacy.load(model) # load spaCy model
print("Loaded model '%s'" % model)
if not output_dir.exists():
@ -37,42 +38,44 @@ def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000,
partitions = partition_all(batch_size, texts)
executor = Parallel(n_jobs=n_jobs)
do = delayed(transform_texts)
tasks = (do(nlp, i, batch, output_dir)
for i, batch in enumerate(partitions))
tasks = (do(nlp, i, batch, output_dir) for i, batch in enumerate(partitions))
def transform_texts(nlp, batch_id, texts, output_dir):
out_path = Path(output_dir) / ('%d.txt' % batch_id)
out_path = Path(output_dir) / ("%d.txt" % batch_id)
if out_path.exists(): # return None in case same batch is called again
return None
print('Processing batch', batch_id)
with out_path.open('w', encoding='utf8') as f:
print("Processing batch", batch_id)
with out_path.open("w", encoding="utf8") as f:
for doc in nlp.pipe(texts):
f.write(' '.join(represent_word(w) for w in doc if not w.is_space))
print('Saved {} texts to {}.txt'.format(len(texts), batch_id))
f.write(" ".join(represent_word(w) for w in doc if not w.is_space))
print("Saved {} texts to {}.txt".format(len(texts), batch_id))
def represent_word(word):
text = word.text
# True-case, i.e. try to normalize sentence-initial capitals.
# Only do this if the lower-cased form is more probable.
if text.istitle() and is_sent_begin(word) \
and word.prob < word.doc.vocab[text.lower()].prob:
if (
and is_sent_begin(word)
and word.prob < word.doc.vocab[text.lower()].prob
text = text.lower()
return text + '|' + word.tag_
return text + "|" + word.tag_
def is_sent_begin(word):
if word.i == 0:
return True
elif word.i >= 2 and word.nbor(-1).text in ('.', '!', '?', '...'):
elif word.i >= 2 and word.nbor(-1).text in (".", "!", "?", "..."):
return True
return False
if __name__ == '__main__':
if __name__ == "__main__":
@ -1,6 +1,6 @@
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
.conllu format for development data, allowing the official scorer to be used.
from __future__ import unicode_literals
import plac
import tqdm
@ -22,7 +22,6 @@ from spacy.matcher import Matcher
import itertools
import random
import numpy.random
import cytoolz
import conll17_ud_eval
@ -35,6 +34,7 @@ spacy.lang.ja.Japanese.Defaults.use_janome = False
def minibatch_by_words(items, size=5000):
if isinstance(size, int):
@ -59,21 +59,31 @@ def minibatch_by_words(items, size=5000):
# Data reading #
space_re = re.compile('\s+')
def split_text(text):
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
space_re = re.compile("\s+")
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
max_doc_length=None, limit=None):
'''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
def split_text(text):
return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
def read_data(
"""Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
include Doc objects created using nlp.make_doc and then aligned against
the gold-standard sequences. If oracle_segments=True, include Doc objects
created from the gold-standard segments. At least one must be True.'''
created from the gold-standard segments. At least one must be True."""
if not raw_text and not oracle_segments:
raise ValueError("At least one of raw_text or oracle_segments must be True")
paragraphs = split_text(text_file.read())
@ -87,22 +97,21 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
for cs in cd:
sent = defaultdict(list)
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
if '.' in id_:
if "." in id_:
if '-' in id_:
if "-" in id_:
id_ = int(id_)-1
head = int(head)-1 if head != '0' else id_
sent['deps'].append('ROOT' if dep == 'root' else dep)
sent['spaces'].append(space_after == '_')
sent['entities'] = ['-'] * len(sent['words'])
sent['heads'], sent['deps'] = projectivize(sent['heads'],
id_ = int(id_) - 1
head = int(head) - 1 if head != "0" else id_
sent["deps"].append("ROOT" if dep == "root" else dep)
sent["spaces"].append(space_after == "_")
sent["entities"] = ["-"] * len(sent["words"])
sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
if oracle_segments:
docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
golds.append(GoldParse(docs[-1], **sent))
@ -128,18 +137,18 @@ def read_conllu(file_):
sent = []
doc = []
for line in file_:
if line.startswith('# newdoc'):
if line.startswith("# newdoc"):
if doc:
doc = []
elif line.startswith('#'):
elif line.startswith("#"):
elif not line.strip():
if sent:
sent = []
if len(sent[-1]) != 10:
raise ValueError
@ -154,25 +163,29 @@ def _make_gold(nlp, text, sent_annots):
# Flatten the conll annotations, and adjust the head indices
flat = defaultdict(list)
for sent in sent_annots:
flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
for field in ["words", "tags", "deps", "entities", "spaces"]:
# Construct text if necessary
assert len(flat['words']) == len(flat['spaces'])
assert len(flat["words"]) == len(flat["spaces"])
if text is None:
text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces']))
text = "".join(
word + " " * space for word, space in zip(flat["words"], flat["spaces"])
doc = nlp.make_doc(text)
gold = GoldParse(doc, **flat)
return doc, gold
# Data transforms for spaCy #
def golds_to_gold_tuples(docs, golds):
'''Get out the annoying 'tuples' format used by begin_training, given the
GoldParse objects.'''
"""Get out the annoying 'tuples' format used by begin_training, given the
GoldParse objects."""
tuples = []
for doc, gold in zip(docs, golds):
text = doc.text
@ -186,15 +199,16 @@ def golds_to_gold_tuples(docs, golds):
# Evaluation #
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
with text_loc.open('r', encoding='utf8') as text_file:
with text_loc.open("r", encoding="utf8") as text_file:
texts = split_text(text_file.read())
docs = list(nlp.pipe(texts))
with sys_loc.open('w', encoding='utf8') as out_file:
with sys_loc.open("w", encoding="utf8") as out_file:
write_conllu(docs, out_file)
with gold_loc.open('r', encoding='utf8') as gold_file:
with gold_loc.open("r", encoding="utf8") as gold_file:
gold_ud = conll17_ud_eval.load_conllu(gold_file)
with sys_loc.open('r', encoding='utf8') as sys_file:
with sys_loc.open("r", encoding="utf8") as sys_file:
sys_ud = conll17_ud_eval.load_conllu(sys_file)
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
return scores
@ -202,10 +216,10 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
def write_conllu(docs, file_):
merger = Matcher(docs[0].vocab)
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
for i, doc in enumerate(docs):
matches = merger(doc)
spans = [doc[start:end+1] for _, start, end in matches]
spans = [doc[start : end + 1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans]
for start_char, end_char in offsets:
doc.merge(start_char, end_char)
@ -214,58 +228,73 @@ def write_conllu(docs, file_):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
file_.write("# text = {text}\n".format(text=sent.text))
for k, token in enumerate(sent):
file_.write(token._.get_conllu_lines(k) + '\n')
file_.write(token._.get_conllu_lines(k) + "\n")
def print_progress(itn, losses, ud_scores):
fields = {
'dep_loss': losses.get('parser', 0.0),
'tag_loss': losses.get('tagger', 0.0),
'words': ud_scores['Words'].f1 * 100,
'sents': ud_scores['Sentences'].f1 * 100,
'tags': ud_scores['XPOS'].f1 * 100,
'uas': ud_scores['UAS'].f1 * 100,
'las': ud_scores['LAS'].f1 * 100,
"dep_loss": losses.get("parser", 0.0),
"tag_loss": losses.get("tagger", 0.0),
"words": ud_scores["Words"].f1 * 100,
"sents": ud_scores["Sentences"].f1 * 100,
"tags": ud_scores["XPOS"].f1 * 100,
"uas": ud_scores["UAS"].f1 * 100,
"las": ud_scores["LAS"].f1 * 100,
header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"]
if itn == 0:
tpl = '\t'.join((
tpl = "\t".join(
print(tpl.format(itn, **fields))
#def get_sent_conllu(sent, sent_id):
# def get_sent_conllu(sent, sent_id):
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
def get_token_conllu(token, i):
if token._.begins_fused:
n = 1
while token.nbor(n)._.inside_fused:
n += 1
id_ = '%d-%d' % (i, i+n)
lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']
id_ = "%d-%d" % (i, i + n)
lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"]
lines = []
if token.head.i == token.i:
head = 0
head = i + (token.head.i - token.i) + 1
fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
str(head), token.dep_.lower(), '_', '_']
return '\n'.join(lines)
fields = [
str(i + 1),
return "\n".join(lines)
Token.set_extension('get_conllu_lines', method=get_token_conllu)
Token.set_extension('begins_fused', default=False)
Token.set_extension('inside_fused', default=False)
Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
@ -274,31 +303,32 @@ Token.set_extension('inside_fused', default=False)
def load_nlp(corpus, config):
lang = corpus.split('_')[0]
lang = corpus.split("_")[0]
nlp = spacy.blank(lang)
if config.vectors:
nlp.vocab.from_disk(config.vectors / 'vocab')
nlp.vocab.from_disk(config.vectors / "vocab")
return nlp
def initialize_pipeline(nlp, docs, golds, config):
if config.multitask_tag:
if config.multitask_sent:
nlp.parser.moves.add_action(2, 'subtok')
nlp.parser.moves.add_action(2, "subtok")
for gold in golds:
for tag in gold.tags:
if tag is not None:
# Replace labels that didn't make the frequency cutoff
actions = set(nlp.parser.labels)
label_set = set([act.split('-')[1] for act in actions if '-' in act])
label_set = set([act.split("-")[1] for act in actions if "-" in act])
for gold in golds:
for i, label in enumerate(gold.labels):
if label is not None and label not in label_set:
gold.labels[i] = label.split('||')[0]
gold.labels[i] = label.split("||")[0]
return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
@ -306,6 +336,7 @@ def initialize_pipeline(nlp, docs, golds, config):
# Command line helpers #
class Config(object):
vectors = attr.ib(default=None)
@ -318,7 +349,7 @@ class Config(object):
def load(cls, loc):
with Path(loc).open('r', encoding='utf8') as file_:
with Path(loc).open("r", encoding="utf8") as file_:
cfg = json.load(file_)
return cls(**cfg)
@ -331,32 +362,36 @@ class Dataset(object):
self.text = None
for file_path in self.path.iterdir():
name = file_path.parts[-1]
if section in name and name.endswith('conllu'):
if section in name and name.endswith("conllu"):
self.conllu = file_path
elif section in name and name.endswith('txt'):
elif section in name and name.endswith("txt"):
self.text = file_path
if self.conllu is None:
msg = "Could not find .txt file in {path} for {section}"
raise IOError(msg.format(section=section, path=path))
if self.text is None:
msg = "Could not find .txt file in {path} for {section}"
self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0]
class TreebankPaths(object):
def __init__(self, ud_path, treebank, **cfg):
self.train = Dataset(ud_path / treebank, 'train')
self.dev = Dataset(ud_path / treebank, 'dev')
self.train = Dataset(ud_path / treebank, "train")
self.dev = Dataset(ud_path / treebank, "dev")
self.lang = self.train.lang
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
"positional", None, str),
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "positional", None, Config.load),
limit=("Size limit", "option", "n", int)
limit=("Size limit", "option", "n", int),
def main(ud_dir, parses_dir, config, corpus, limit=0):
paths = TreebankPaths(ud_dir, corpus)
@ -365,8 +400,13 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
print("Train and evaluate", corpus, "using lang", paths.lang)
nlp = load_nlp(paths.lang, config)
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
max_doc_length=config.max_doc_length, limit=limit)
docs, golds = read_data(
optimizer = initialize_pipeline(nlp, docs, golds, config)
@ -379,14 +419,19 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
for batch in batches:
batch_docs, batch_gold = zip(*batch)
pbar.update(sum(len(doc) for doc in batch_docs))
nlp.update(batch_docs, batch_gold, sgd=optimizer,
drop=config.dropout, losses=losses)
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
with nlp.use_params(optimizer.averages):
scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
print_progress(i, losses, scores)
if __name__ == '__main__':
if __name__ == "__main__":
@ -1,4 +1,4 @@
'''This example shows how to add a multi-task objective that is trained
"""This example shows how to add a multi-task objective that is trained
alongside the entity recognizer. This is an alternative to adding features
to the model.
@ -19,7 +19,7 @@ The specific example here is not necessarily a good idea --- but it shows
how an arbitrary objective function for some word can be used.
Developed and tested for spaCy 2.0.6
import random
import plac
import spacy
@ -30,30 +30,29 @@ random.seed(0)
PWD = os.path.dirname(__file__)
TRAIN_DATA = list(read_json_file(os.path.join(PWD, 'training-data.json')))
TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json")))
def get_position_label(i, words, tags, heads, labels, ents):
'''Return labels indicating the position of the word in the document.
"""Return labels indicating the position of the word in the document.
if len(words) < 20:
return 'short-doc'
return "short-doc"
elif i == 0:
return 'first-word'
return "first-word"
elif i < 10:
return 'early-word'
return "early-word"
elif i < 20:
return 'mid-word'
elif i == len(words)-1:
return 'last-word'
return "mid-word"
elif i == len(words) - 1:
return "last-word"
return 'late-word'
return "late-word"
def main(n_iter=10):
nlp = spacy.blank('en')
ner = nlp.create_pipe('ner')
nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
@ -71,15 +70,16 @@ def main(n_iter=10):
[gold], # batch of annotations
drop=0.2, # dropout - make it harder to memorise data
sgd=optimizer, # callable to update weights
print(losses.get('nn_labeller', 0.0), losses['ner'])
print(losses.get("nn_labeller", 0.0), losses["ner"])
# test the trained model
for text, _ in TRAIN_DATA:
doc = nlp(text)
print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
if __name__ == '__main__':
if __name__ == "__main__":
@ -1,4 +1,4 @@
'''This script is experimental.
"""This script is experimental.
Try pre-training the CNN component of the text categorizer using a cheap
language modelling-like objective. Specifically, we load pre-trained vectors
@ -12,7 +12,7 @@ To evaluate the technique, we're pre-training with the 50k texts from the IMDB
corpus, and then training with only 100 labels. Note that it's a bit dirty to
pre-train with the development data, but also not *so* terrible: we're not using
the development labels, after all --- only the unlabelled text.
import plac
import random
import spacy
@ -46,8 +46,8 @@ def load_textcat_data(limit=0):
train_data = train_data[-limit:]
texts, labels = zip(*train_data)
eval_texts, eval_labels = zip(*eval_data)
cats = [{'POSITIVE': bool(y), 'NEGATIVE': not bool(y)} for y in labels]
eval_cats = [{'POSITIVE': bool(y), 'NEGATIVE': not bool(y)} for y in eval_labels]
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
return (texts, cats), (eval_texts, eval_cats)
@ -57,6 +57,7 @@ def prefer_gpu():
return False
import cupy.random
return True
@ -68,7 +69,7 @@ def build_textcat_model(tok2vec, nr_class, width):
from thinc.misc import Residual, LayerNorm
from spacy._ml import logistic, zero_init
with Model.define_operators({'>>': chain}):
with Model.define_operators({">>": chain}):
model = (
>> flatten_add_lengths
@ -78,27 +79,35 @@ def build_textcat_model(tok2vec, nr_class, width):
model.tok2vec = tok2vec
return model
def block_gradients(model):
from thinc.api import wrap
def forward(X, drop=0.):
def forward(X, drop=0.0):
Y, _ = model.begin_update(X, drop=drop)
return Y, None
return wrap(forward, model)
def create_pipeline(width, embed_size, vectors_model):
print("Load vectors")
nlp = spacy.load(vectors_model)
print("Start training")
textcat = TextCategorizer(nlp.vocab,
labels=['POSITIVE', 'NEGATIVE'],
textcat = TextCategorizer(
labels=["POSITIVE", "NEGATIVE"],
Tok2Vec(width=width, embed_size=embed_size), 2, width))
Tok2Vec(width=width, embed_size=embed_size), 2, width
return nlp
def train_tensorizer(nlp, texts, dropout, n_iter):
tensorizer = nlp.create_pipe('tensorizer')
tensorizer = nlp.create_pipe("tensorizer")
optimizer = nlp.begin_training()
for i in range(n_iter):
@ -109,36 +118,43 @@ def train_tensorizer(nlp, texts, dropout, n_iter):
return optimizer
def train_textcat(nlp, n_texts, n_iter=10):
textcat = nlp.get_pipe('textcat')
textcat = nlp.get_pipe("textcat")
tok2vec_weights = textcat.model.tok2vec.to_bytes()
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
print("Using {} examples ({} training, {} evaluation)"
.format(n_texts, len(train_texts), len(dev_texts)))
train_data = list(zip(train_texts,
[{'cats': cats} for cats in train_cats]))
"Using {} examples ({} training, {} evaluation)".format(
n_texts, len(train_texts), len(dev_texts)
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
with nlp.disable_pipes(*other_pipes): # only train textcat
optimizer = nlp.begin_training()
print("Training the model...")
print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
for i in range(n_iter):
losses = {'textcat': 0.0}
losses = {"textcat": 0.0}
# batch up the examples using spaCy's minibatch
batches = minibatch(tqdm.tqdm(train_data), size=2)
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
with textcat.model.use_params(optimizer.averages):
# evaluate on the dev data split off in load_data()
scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table
.format(losses['textcat'], scores['textcat_p'],
scores['textcat_r'], scores['textcat_f']))
"{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table
def evaluate_textcat(tokenizer, textcat, texts, cats):
@ -153,9 +169,9 @@ def evaluate_textcat(tokenizer, textcat, texts, cats):
if label not in gold:
if score >= 0.5 and gold[label] >= 0.5:
tp += 1.
tp += 1.0
elif score >= 0.5 and gold[label] < 0.5:
fp += 1.
fp += 1.0
elif score < 0.5 and gold[label] < 0.5:
tn += 1
elif score < 0.5 and gold[label] >= 0.5:
@ -163,8 +179,7 @@ def evaluate_textcat(tokenizer, textcat, texts, cats):
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f_score = 2 * (precision * recall) / (precision + recall)
return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}
return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
@ -173,10 +188,16 @@ def evaluate_textcat(tokenizer, textcat, texts, cats):
pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
train_iters=("Number of iterations to pretrain", "option", "tn", int),
train_examples=("Number of labelled examples", "option", "eg", int),
vectors_model=("Name or path to vectors model to learn from")
vectors_model=("Name or path to vectors model to learn from"),
def main(width, embed_size, vectors_model,
pretrain_iters=30, train_iters=30, train_examples=1000):
def main(
use_gpu = prefer_gpu()
@ -190,5 +211,6 @@ def main(width, embed_size, vectors_model,
print("Train textcat")
train_textcat(nlp, train_examples, n_iter=train_iters)
if __name__ == '__main__':
if __name__ == "__main__":
@ -29,73 +29,113 @@ from spacy.util import minibatch, compounding
# training data: texts, heads and dependency labels
# for no relation, we simply chose an arbitrary dependency label, e.g. '-'
("find a cafe with great wifi", {
'heads': [0, 2, 0, 5, 5, 2], # index of token head
'deps': ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
("find a hotel near the beach", {
'heads': [0, 2, 0, 5, 5, 2],
'deps': ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE']
("find me the closest gym that's open late", {
'heads': [0, 0, 4, 4, 0, 6, 4, 6, 6],
'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME']
("show me the cheapest store that sells flowers", {
'heads': [0, 0, 4, 4, 0, 4, 4, 4], # attach "flowers" to store!
'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT']
("find a nice restaurant in london", {
'heads': [0, 3, 3, 0, 3, 3],
'deps': ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
("show me the coolest hostel in berlin", {
'heads': [0, 0, 4, 4, 0, 4, 4],
'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
("find a good italian restaurant near work", {
'heads': [0, 4, 4, 4, 0, 4, 5],
"find a cafe with great wifi",
"heads": [0, 2, 0, 5, 5, 2], # index of token head
"deps": ["ROOT", "-", "PLACE", "-", "QUALITY", "ATTRIBUTE"],
"find a hotel near the beach",
"heads": [0, 2, 0, 5, 5, 2],
"deps": ["ROOT", "-", "PLACE", "QUALITY", "-", "ATTRIBUTE"],
"find me the closest gym that's open late",
"heads": [0, 0, 4, 4, 0, 6, 4, 6, 6],
"deps": [
"show me the cheapest store that sells flowers",
"heads": [0, 0, 4, 4, 0, 4, 4, 4], # attach "flowers" to store!
"deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "-", "PRODUCT"],
"find a nice restaurant in london",
"heads": [0, 3, 3, 0, 3, 3],
"deps": ["ROOT", "-", "QUALITY", "PLACE", "-", "LOCATION"],
"show me the coolest hostel in berlin",
"heads": [0, 0, 4, 4, 0, 4, 4],
"deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "LOCATION"],
"find a good italian restaurant near work",
"heads": [0, 4, 4, 4, 0, 4, 5],
"deps": [
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int))
n_iter=("Number of training iterations", "option", "n", int),
def main(model=None, output_dir=None, n_iter=15):
"""Load the model, set up the pipeline and train the parser."""
if model is not None:
nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
nlp = spacy.blank('en') # create blank Language class
nlp = spacy.blank("en") # create blank Language class
print("Created blank 'en' model")
# We'll use the built-in dependency parser class, but we want to create a
# fresh instance – just in case.
if 'parser' in nlp.pipe_names:
parser = nlp.create_pipe('parser')
if "parser" in nlp.pipe_names:
parser = nlp.create_pipe("parser")
nlp.add_pipe(parser, first=True)
for text, annotations in TRAIN_DATA:
for dep in annotations.get('deps', []):
for dep in annotations.get("deps", []):
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
with nlp.disable_pipes(*other_pipes): # only train parser
optimizer = nlp.begin_training()
for itn in range(n_iter):
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
print('Losses', losses)
print("Losses", losses)
# test the trained model
@ -115,16 +155,18 @@ def main(model=None, output_dir=None, n_iter=15):
def test_model(nlp):
texts = ["find a hotel with good wifi",
"find me the cheapest gym near work",
"show me the best hotel in berlin"]
texts = [
"find a hotel with good wifi",
"find me the cheapest gym near work",
"show me the best hotel in berlin",
docs = nlp.pipe(texts)
for doc in docs:
print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-'])
print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"])
if __name__ == '__main__':
if __name__ == "__main__":
# Expected output:
@ -20,51 +20,48 @@ from spacy.util import minibatch, compounding
# training data
('Who is Shaka Khan?', {
'entities': [(7, 17, 'PERSON')]
('I like London and Berlin.', {
'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int))
n_iter=("Number of training iterations", "option", "n", int),
def main(model=None, output_dir=None, n_iter=100):
"""Load the model, set up the pipeline and train the entity recognizer."""
if model is not None:
nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
nlp = spacy.blank('en') # create blank Language class
nlp = spacy.blank("en") # create blank Language class
print("Created blank 'en' model")
# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
ner = nlp.create_pipe('ner')
if "ner" not in nlp.pipe_names:
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
ner = nlp.get_pipe('ner')
ner = nlp.get_pipe("ner")
# add labels
for _, annotations in TRAIN_DATA:
for ent in annotations.get('entities'):
for ent in annotations.get("entities"):
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes): # only train NER
optimizer = nlp.begin_training()
for itn in range(n_iter):
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
@ -72,14 +69,15 @@ def main(model=None, output_dir=None, n_iter=100):
annotations, # batch of annotations
drop=0.5, # dropout - make it harder to memorise data
sgd=optimizer, # callable to update weights
print('Losses', losses)
print("Losses", losses)
# test the trained model
for text, _ in TRAIN_DATA:
doc = nlp(text)
print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
# save model to output directory
if output_dir is not None:
@ -94,11 +92,11 @@ def main(model=None, output_dir=None, n_iter=100):
nlp2 = spacy.load(output_dir)
for text, _ in TRAIN_DATA:
doc = nlp2(text)
print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
if __name__ == '__main__':
if __name__ == "__main__":
# Expected output:
@ -35,7 +35,7 @@ from spacy.util import minibatch, compounding
# new entity label
# training data
# Note: If you're using an existing model, make sure to mix in examples of
@ -43,29 +43,21 @@ LABEL = 'ANIMAL'
# model might learn the new type, but "forget" what it previously knew.
# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
("Horses are too tall and they pretend to care about your feelings", {
'entities': [(0, 6, 'ANIMAL')]
("Do they bite?", {
'entities': []
("horses are too tall and they pretend to care about your feelings", {
'entities': [(0, 6, 'ANIMAL')]
("horses pretend to care about your feelings", {
'entities': [(0, 6, 'ANIMAL')]
("they pretend to care about your feelings, those horses", {
'entities': [(48, 54, 'ANIMAL')]
("horses?", {
'entities': [(0, 6, 'ANIMAL')]
"Horses are too tall and they pretend to care about your feelings",
{"entities": [(0, 6, "ANIMAL")]},
("Do they bite?", {"entities": []}),
"horses are too tall and they pretend to care about your feelings",
{"entities": [(0, 6, "ANIMAL")]},
("horses pretend to care about your feelings", {"entities": [(0, 6, "ANIMAL")]}),
"they pretend to care about your feelings, those horses",
{"entities": [(48, 54, "ANIMAL")]},
("horses?", {"entities": [(0, 6, "ANIMAL")]}),
@ -73,25 +65,26 @@ TRAIN_DATA = [
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
new_model_name=("New model name for model meta.", "option", "nm", str),
output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
n_iter=("Number of training iterations", "option", "n", int),
def main(model=None, new_model_name="animal", output_dir=None, n_iter=10):
"""Set up the pipeline and entity recognizer, and train the new entity."""
if model is not None:
nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
nlp = spacy.blank('en') # create blank Language class
nlp = spacy.blank("en") # create blank Language class
print("Created blank 'en' model")
# Add entity recognizer to model if it's not in the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
ner = nlp.create_pipe('ner')
if "ner" not in nlp.pipe_names:
ner = nlp.create_pipe("ner")
# otherwise, get it, so we can add labels to it
ner = nlp.get_pipe('ner')
ner = nlp.get_pipe("ner")
ner.add_label(LABEL) # add new entity label to entity recognizer
ner.add_label(LABEL) # add new entity label to entity recognizer
if model is None:
optimizer = nlp.begin_training()
@ -100,21 +93,20 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
optimizer = nlp.entity.create_optimizer()
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes): # only train NER
for itn in range(n_iter):
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
print('Losses', losses)
nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
print("Losses", losses)
# test the trained model
test_text = 'Do you like horses?'
test_text = "Do you like horses?"
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
@ -125,7 +117,7 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
output_dir = Path(output_dir)
if not output_dir.exists():
nlp.meta['name'] = new_model_name # rename model
nlp.meta["name"] = new_model_name # rename model
print("Saved model to", output_dir)
@ -137,5 +129,5 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
print(ent.label_, ent.text)
if __name__ == '__main__':
if __name__ == "__main__":
@ -18,62 +18,69 @@ from spacy.util import minibatch, compounding
# training data
("They trade mortgage-backed securities.", {
'heads': [1, 1, 4, 4, 5, 1, 1],
'deps': ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
("I like London and Berlin.", {
'heads': [1, 1, 1, 2, 2, 1],
'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
"They trade mortgage-backed securities.",
"heads": [1, 1, 4, 4, 5, 1, 1],
"deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
"I like London and Berlin.",
"heads": [1, 1, 1, 2, 2, 1],
"deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int))
n_iter=("Number of training iterations", "option", "n", int),
def main(model=None, output_dir=None, n_iter=10):
"""Load the model, set up the pipeline and train the parser."""
if model is not None:
nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
nlp = spacy.blank('en') # create blank Language class
nlp = spacy.blank("en") # create blank Language class
print("Created blank 'en' model")
# add the parser to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'parser' not in nlp.pipe_names:
parser = nlp.create_pipe('parser')
if "parser" not in nlp.pipe_names:
parser = nlp.create_pipe("parser")
nlp.add_pipe(parser, first=True)
# otherwise, get it, so we can add labels to it
parser = nlp.get_pipe('parser')
parser = nlp.get_pipe("parser")
# add labels to the parser
for _, annotations in TRAIN_DATA:
for dep in annotations.get('deps', []):
for dep in annotations.get("deps", []):
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
with nlp.disable_pipes(*other_pipes): # only train parser
optimizer = nlp.begin_training()
for itn in range(n_iter):
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
print('Losses', losses)
print("Losses", losses)
# test the trained model
test_text = "I like securities."
doc = nlp(test_text)
print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])
# save model to output directory
if output_dir is not None:
@ -87,10 +94,10 @@ def main(model=None, output_dir=None, n_iter=10):
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
doc = nlp2(test_text)
print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])
if __name__ == '__main__':
if __name__ == "__main__":
# expected result:
@ -25,11 +25,7 @@ from spacy.util import minibatch, compounding
# http://universaldependencies.github.io/docs/u/pos/index.html
# You may also specify morphological features for your tags, from the universal
# scheme.
'N': {'pos': 'NOUN'},
'V': {'pos': 'VERB'},
'J': {'pos': 'ADJ'}
TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
# Usually you'll read this in, of course. Data formats vary. Ensure your
# strings are unicode and that the number of tags assigned matches spaCy's
@ -37,16 +33,17 @@ TAG_MAP = {
# that specifies the gold-standard tokenization, e.g.:
# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}),
("Eat blue ham", {'tags': ['V', 'J', 'N']})
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
("Eat blue ham", {"tags": ["V", "J", "N"]}),
lang=("ISO Code of language to use", "option", "l", str),
output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int))
def main(lang='en', output_dir=None, n_iter=25):
n_iter=("Number of training iterations", "option", "n", int),
def main(lang="en", output_dir=None, n_iter=25):
"""Create a new model, set up the pipeline and train the tagger. In order to
train the tagger with a custom tag map, we're creating a new Language
instance with a custom vocab.
@ -54,7 +51,7 @@ def main(lang='en', output_dir=None, n_iter=25):
nlp = spacy.blank(lang)
# add the tagger to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
tagger = nlp.create_pipe('tagger')
tagger = nlp.create_pipe("tagger")
# Add the tags. This needs to be done before you start training.
for tag, values in TAG_MAP.items():
tagger.add_label(tag, values)
@ -65,16 +62,16 @@ def main(lang='en', output_dir=None, n_iter=25):
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
print('Losses', losses)
print("Losses", losses)
# test the trained model
test_text = "I like blue eggs"
doc = nlp(test_text)
print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])
print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])
# save model to output directory
if output_dir is not None:
@ -88,10 +85,10 @@ def main(lang='en', output_dir=None, n_iter=25):
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
doc = nlp2(test_text)
print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])
print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])
if __name__ == '__main__':
if __name__ == "__main__":
# Expected output:
@ -23,55 +23,62 @@ from spacy.util import minibatch, compounding
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
output_dir=("Optional output directory", "option", "o", Path),
n_texts=("Number of texts to train from", "option", "t", int),
n_iter=("Number of training iterations", "option", "n", int))
n_iter=("Number of training iterations", "option", "n", int),
def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
if model is not None:
nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
nlp = spacy.blank('en') # create blank Language class
nlp = spacy.blank("en") # create blank Language class
print("Created blank 'en' model")
# add the text classifier to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'textcat' not in nlp.pipe_names:
textcat = nlp.create_pipe('textcat')
if "textcat" not in nlp.pipe_names:
textcat = nlp.create_pipe("textcat")
nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
textcat = nlp.get_pipe('textcat')
textcat = nlp.get_pipe("textcat")
# add label to text classifier
# load the IMDB dataset
print("Loading IMDB data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
print("Using {} examples ({} training, {} evaluation)"
.format(n_texts, len(train_texts), len(dev_texts)))
train_data = list(zip(train_texts,
[{'cats': cats} for cats in train_cats]))
"Using {} examples ({} training, {} evaluation)".format(
n_texts, len(train_texts), len(dev_texts)
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
with nlp.disable_pipes(*other_pipes): # only train textcat
optimizer = nlp.begin_training()
print("Training the model...")
print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
for i in range(n_iter):
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(train_data, size=compounding(4., 32., 1.001))
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
with textcat.model.use_params(optimizer.averages):
# evaluate on the dev data split off in load_data()
scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table
.format(losses['textcat'], scores['textcat_p'],
scores['textcat_r'], scores['textcat_f']))
"{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table
# test the trained model
test_text = "This movie sucked"
@ -99,7 +106,7 @@ def load_data(limit=0, split=0.8):
train_data = train_data[-limit:]
texts, labels = zip(*train_data)
cats = [{'POSITIVE': bool(y)} for y in labels]
cats = [{"POSITIVE": bool(y)} for y in labels]
split = int(len(train_data) * split)
return (texts[:split], cats[:split]), (texts[split:], cats[split:])
@ -116,9 +123,9 @@ def evaluate(tokenizer, textcat, texts, cats):
if label not in gold:
if score >= 0.5 and gold[label] >= 0.5:
tp += 1.
tp += 1.0
elif score >= 0.5 and gold[label] < 0.5:
fp += 1.
fp += 1.0
elif score < 0.5 and gold[label] < 0.5:
tn += 1
elif score < 0.5 and gold[label] >= 0.5:
@ -126,8 +133,8 @@ def evaluate(tokenizer, textcat, texts, cats):
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f_score = 2 * (precision * recall) / (precision + recall)
return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}
return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
if __name__ == '__main__':
if __name__ == "__main__":
@ -14,8 +14,13 @@ from spacy.language import Language
vectors_loc=("Path to .vec file", "positional", None, str),
lang=("Optional language ID. If not set, blank Language() will be used.",
"positional", None, str))
"Optional language ID. If not set, blank Language() will be used.",
def main(vectors_loc, lang=None):
if lang is None:
nlp = Language()
@ -24,21 +29,21 @@ def main(vectors_loc, lang=None):
# save the model to disk and load it back later (models always need a
# "lang" setting). Use 'xx' for blank multi-language class.
nlp = spacy.blank(lang)
with open(vectors_loc, 'rb') as file_:
with open(vectors_loc, "rb") as file_:
header = file_.readline()
nr_row, nr_dim = header.split()
for line in file_:
line = line.rstrip().decode('utf8')
pieces = line.rsplit(' ', int(nr_dim))
line = line.rstrip().decode("utf8")
pieces = line.rsplit(" ", int(nr_dim))
word = pieces[0]
vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
nlp.vocab.set_vector(word, vector) # add the vectors to the vocab
# test the vectors and similarity
text = 'class colspan'
text = "class colspan"
doc = nlp(text)
print(text, doc[0].similarity(doc[1]))
if __name__ == '__main__':
if __name__ == "__main__":
@ -14,26 +14,45 @@ import plac
import spacy
import tensorflow as tf
import tqdm
from tensorflow.contrib.tensorboard.plugins.projector import visualize_embeddings, ProjectorConfig
from tensorflow.contrib.tensorboard.plugins.projector import (
vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str),
out_loc=("Path to output folder for tensorboard session data", "positional", None, str),
name=("Human readable name for tsv file and vectors tensor", "positional", None, str),
"Path to output folder for tensorboard session data",
"Human readable name for tsv file and vectors tensor",
def main(vectors_loc, out_loc, name="spaCy_vectors"):
meta_file = "{}.tsv".format(name)
out_meta_file = path.join(out_loc, meta_file)
print('Loading spaCy vectors model: {}'.format(vectors_loc))
print("Loading spaCy vectors model: {}".format(vectors_loc))
model = spacy.load(vectors_loc)
print('Finding lexemes with vectors attached: {}'.format(vectors_loc))
strings_stream = tqdm.tqdm(model.vocab.strings, total=len(model.vocab.strings), leave=False)
print("Finding lexemes with vectors attached: {}".format(vectors_loc))
strings_stream = tqdm.tqdm(
model.vocab.strings, total=len(model.vocab.strings), leave=False
queries = [w for w in strings_stream if model.vocab.has_vector(w)]
vector_count = len(queries)
print('Building Tensorboard Projector metadata for ({}) vectors: {}'.format(vector_count, out_meta_file))
"Building Tensorboard Projector metadata for ({}) vectors: {}".format(
vector_count, out_meta_file
# Store vector data in a tensorflow variable
tf_vectors_variable = numpy.zeros((vector_count, model.vocab.vectors.shape[1]))
@ -41,22 +60,26 @@ def main(vectors_loc, out_loc, name="spaCy_vectors"):
# Write a tab-separated file that contains information about the vectors for visualization
# Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata
with open(out_meta_file, 'wb') as file_metadata:
with open(out_meta_file, "wb") as file_metadata:
# Define columns in the first row
# Write out a row for each vector that we add to the tensorflow variable we created
vec_index = 0
for text in tqdm.tqdm(queries, total=len(queries), leave=False):
# https://github.com/tensorflow/tensorflow/issues/9094
text = '<Space>' if text.lstrip() == '' else text
text = "<Space>" if text.lstrip() == "" else text
lex = model.vocab[text]
# Store vector data and metadata
tf_vectors_variable[vec_index] = model.vocab.get_vector(text)
file_metadata.write("{}\t{}\n".format(text, math.exp(lex.prob) * vector_count).encode('utf-8'))
"{}\t{}\n".format(text, math.exp(lex.prob) * vector_count).encode(
vec_index += 1
print('Running Tensorflow Session...')
print("Running Tensorflow Session...")
sess = tf.InteractiveSession()
tf.Variable(tf_vectors_variable, trainable=False, name=name)
@ -73,10 +96,10 @@ def main(vectors_loc, out_loc, name="spaCy_vectors"):
visualize_embeddings(writer, config)
# Save session and print run command to the output
print('Saving Tensorboard Session...')
saver.save(sess, path.join(out_loc, '{}.ckpt'.format(name)))
print('Done. Run `tensorboard --logdir={0}` to view in Tensorboard'.format(out_loc))
print("Saving Tensorboard Session...")
saver.save(sess, path.join(out_loc, "{}.ckpt".format(name)))
print("Done. Run `tensorboard --logdir={0}` to view in Tensorboard".format(out_loc))
if __name__ == '__main__':
if __name__ == "__main__":
@ -1,17 +1,17 @@
# Our libraries
# Third party dependencies
pathlib==1.0.1; python_version < "3.4"
# Development dependencies
@ -7,10 +7,27 @@ import sys
import contextlib
from distutils.command.build_ext import build_ext
from distutils.sysconfig import get_python_inc
import distutils.util
from distutils import ccompiler, msvccompiler
from setuptools import Extension, setup, find_packages
def is_new_osx():
    """Check whether we're building on OSX 10.7 or a later 10.x release.

    The decision is based on the platform tag returned by
    ``distutils.util.get_platform()``, which is expected to look like
    ``macosx-10.<minor>-<arch>``.

    RETURNS (bool): True only when ``sys.platform`` is ``darwin``, the tag
        starts with ``macosx-10`` and its minor version is at least 7.
        False on every other platform, for any other tag, or when the
        minor version cannot be parsed from the tag.
    """
    # Cheap check first: no need to compute the platform tag off macOS.
    if sys.platform != "darwin":
        return False
    name = distutils.util.get_platform()
    if not name.startswith("macosx-10"):
        return False
    try:
        minor_version = int(name.split("-")[1].split(".")[1])
    except (IndexError, ValueError):
        # Unexpected tag layout (e.g. no minor component): report "not new"
        # instead of aborting the build with a parsing error.
        return False
    return minor_version >= 7
PACKAGE_DATA = {"": ["*.pyx", "*.pxd", "*.txt", "*.tokens"]}
@ -57,8 +74,17 @@ COMPILE_OPTIONS = {
LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []}
# I don't understand this very well yet. See Issue #267
# Fingers crossed!
if is_new_osx():
# On Mac, use libc++ because Apple deprecated use of
# libstdc++
# g++ (used by unix compiler on mac) links to libstdc++ as a default lib.
# See: https://stackoverflow.com/questions/1653047/avoid-linking-to-libstdc
USE_OPENMP_DEFAULT = "0" if sys.platform != "darwin" else None
if os.environ.get("USE_OPENMP", USE_OPENMP_DEFAULT) == "1":
if sys.platform == "darwin":
@ -200,15 +226,14 @@ def setup_package():
'pathlib==1.0.1; python_version < "3.4"',
@ -8,8 +8,9 @@ from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.t2v import Pooling, sum_pool
from thinc.misc import Residual
from thinc.misc import LayerNorm as LN
from thinc.misc import FeatureExtracter
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
from thinc.api import with_getitem, flatten_add_lengths
from thinc.api import uniqued, wrap, noop
from thinc.api import with_square_sequences
from thinc.linear.linear import LinearModel
@ -10,7 +10,7 @@ __uri__ = "https://spacy.io"
__author__ = "Explosion AI"
__email__ = "contact@explosion.ai"
__license__ = "MIT"
__release__ = False
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@ -4,9 +4,9 @@ from __future__ import unicode_literals
import plac
from pathlib import Path
from wasabi import Printer
import srsly
from ..util import write_jsonl, write_json
from ..compat import json_dumps, path2str
from ..compat import path2str
from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
from .converters import ner_jsonl2json
from ._messages import Messages
@ -77,9 +77,9 @@ def convert(
suffix = ".{}".format(file_type)
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
if file_type == "json":
write_json(output_file, data)
srsly.write_json(output_file, data)
elif file_type == "jsonl":
write_jsonl(output_file, data)
srsly.write_jsonl(output_file, data)
@ -87,7 +87,6 @@ def convert(
# Print to stdout
if file_type == "json":
srsly.write_json("-", data)
elif file_type == "jsonl":
for line in data:
srsly.write_jsonl("-", data)
@ -1,9 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from cytoolz import partition_all
from ...gold import iob_to_biluo
from ...util import minibatch
def iob2json(input_data, n_sents=10, *args, **kwargs):
@ -11,7 +10,7 @@ def iob2json(input_data, n_sents=10, *args, **kwargs):
Convert IOB files into JSON format for use with train cli.
docs = []
for group in partition_all(n_sents, docs):
for group in minibatch(docs, n_sents):
group = list(group)
first = group.pop(0)
to_extend = first["paragraphs"][0]["sentences"]
@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import ujson
import srsly
from ...util import get_lang_class
from .._messages import Messages
@ -11,7 +11,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
if lang is None:
raise ValueError(Messages.M054)
json_docs = []
input_tuples = [ujson.loads(line) for line in input_data]
input_tuples = [srsly.json_loads(line) for line in input_data]
nlp = get_lang_class(lang)()
for i, (raw_text, ents) in enumerate(input_tuples):
doc = nlp.make_doc(raw_text)
@ -5,10 +5,11 @@ from pathlib import Path
from collections import Counter
import plac
import sys
import srsly
from wasabi import Printer, MESSAGES
from ..gold import GoldCorpus, read_json_object
from ..util import load_model, get_lang_class, read_json, read_jsonl
from ..util import load_model, get_lang_class
# from .schemas import get_schema, validate_json
from ._messages import Messages
@ -320,11 +321,11 @@ def debug_data(
def _load_file(file_path, msg):
file_name = file_path.parts[-1]
if file_path.suffix == ".json":
data = read_json(file_path)
data = srsly.read_json(file_path)
msg.good("Loaded {}".format(file_name))
return data
elif file_path.suffix == ".jsonl":
data = read_jsonl(file_path)
data = srsly.read_jsonl(file_path)
msg.good("Loaded {}".format(file_name))
return data
@ -5,6 +5,7 @@ import plac
import platform
from pathlib import Path
from wasabi import Printer
import srsly
from ._messages import Messages
from ..compat import path2str, basestring_, unicode_
@ -32,7 +33,7 @@ def info(model=None, markdown=False, silent=False):
meta_path = model_path / "meta.json"
if not meta_path.is_file():
msg.fail(Messages.M020, meta_path, exits=1)
meta = util.read_json(meta_path)
meta = srsly.read_json(meta_path)
if model_path.resolve() != model_path:
meta["link"] = path2str(model_path)
meta["source"] = path2str(model_path.resolve())
@ -11,12 +11,13 @@ from preshed.counter import PreshCounter
import tarfile
import gzip
import zipfile
import srsly
from wasabi import Printer
from ._messages import Messages
from ..vectors import Vectors
from ..errors import Errors, Warnings, user_warning
from ..util import ensure_path, get_lang_class, read_jsonl
from ..util import ensure_path, get_lang_class
import ftfy
@ -33,7 +34,7 @@ msg = Printer()
freqs_loc=("Location of words frequencies file", "option", "f", Path),
jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
vectors_loc=("Optional vectors file in Word2Vec format" "option", "v", str),
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
def init_model(
@ -59,7 +60,7 @@ def init_model(
msg.warn(Messages.M063, Messages.M064)
jsonl_loc = ensure_path(jsonl_loc)
lex_attrs = read_jsonl(jsonl_loc)
lex_attrs = srsly.read_jsonl(jsonl_loc)
clusters_loc = ensure_path(clusters_loc)
freqs_loc = ensure_path(freqs_loc)
@ -5,9 +5,10 @@ import plac
import shutil
from pathlib import Path
from wasabi import Printer, get_raw_input
import srsly
from ._messages import Messages
from ..compat import path2str, json_dumps
from ..compat import path2str
from .. import util
from .. import about
@ -40,7 +41,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
meta_path = meta_path or input_path / "meta.json"
if meta_path.is_file():
meta = util.read_json(meta_path)
meta = srsly.read_json(meta_path)
if not create_meta: # only print if user doesn't want to overwrite
msg.good(Messages.M041, meta_path)
@ -64,7 +65,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
Path.mkdir(package_path, parents=True)
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
create_file(main_path / "meta.json", json_dumps(meta))
create_file(main_path / "meta.json", srsly.json_dumps(meta))
create_file(main_path / "setup.py", TEMPLATE_SETUP)
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
create_file(package_path / "__init__.py", TEMPLATE_INIT)
@ -5,8 +5,6 @@ import plac
import random
import numpy
import time
import ujson
import sys
from collections import Counter
from pathlib import Path
from thinc.v2v import Affine, Maxout
@ -14,10 +12,10 @@ from thinc.api import wrap
from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu
from wasabi import Printer
import srsly
from ..tokens import Doc
from ..attrs import ID, HEAD
from ..compat import json_dumps
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
from .. import util
@ -72,7 +70,7 @@ def pretrain(
if not output_dir.exists():
msg.good("Created output directory")
util.write_json(output_dir / "config.json", config)
srsly.write_json(output_dir / "config.json", config)
msg.good("Saved settings to config.json")
# Load texts from file or stdin
@ -81,12 +79,12 @@ def pretrain(
if not texts_loc.exists():
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
with msg.loading("Loading input texts..."):
texts = list(util.read_jsonl(texts_loc))
texts = list(srsly.read_jsonl(texts_loc))
msg.good("Loaded input texts")
else: # reading from stdin
msg.text("Reading input text from stdin...")
texts = stream_texts()
texts = srsly.read_jsonl("-")
with msg.loading("Loading model '{}'...".format(vectors_model)):
nlp = util.load_model(vectors_model)
@ -130,18 +128,13 @@ def pretrain(
"epoch": epoch,
with (output_dir / "log.jsonl").open("a") as file_:
file_.write(json_dumps(log) + "\n")
file_.write(srsly.json_dumps(log) + "\n")
tracker.epoch_loss = 0.0
if texts_loc != "-":
# Reshuffle the texts if texts were loaded from a file
def stream_texts():
for line in sys.stdin:
yield ujson.loads(line)
def make_update(model, docs, optimizer, drop=0.0):
"""Perform an update over a single batch of documents.
@ -3,12 +3,12 @@ from __future__ import unicode_literals, division, print_function
import plac
from pathlib import Path
import ujson
import srsly
import cProfile
import pstats
import sys
import tqdm
import cytoolz
import itertools
import thinc.extra.datasets
from wasabi import Printer
@ -40,7 +40,7 @@ def profile(model, inputs=None, n_texts=10000):
with msg.loading("Loading model '{}'...".format(model)):
nlp = load_model(model)
msg.good("Loaded model '{}'".format(model))
texts = list(cytoolz.take(n_texts, inputs))
texts = list(itertools.islice(inputs, n_texts))
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
msg.divider("Profile stats")
@ -64,6 +64,6 @@ def _read_inputs(loc, msg):
msg.info("Using data from {}".format(input_path.parts[-1]))
file_ = input_path.open()
for line in file_:
data = ujson.loads(line)
data = srsly.json_loads(line)
text = data["text"]
yield text
@ -3,9 +3,9 @@ from __future__ import unicode_literals
from pathlib import Path
from jsonschema import Draft4Validator
import srsly
from ...errors import Errors
from ...util import read_json
@ -25,7 +25,7 @@ def get_schema(name):
schema_path = Path(__file__).parent / "{}.json".format(name)
if not schema_path.exists():
raise ValueError(Errors.E104.format(name=name))
schema = read_json(schema_path)
schema = srsly.read_json(schema_path)
# TODO: replace with (stable) Draft6Validator, if available
validator = Draft4Validator(schema)
@ -7,6 +7,7 @@ import tqdm
from thinc.neural._classes.model import Model
from timeit import default_timer as timer
import shutil
import srsly
from wasabi import Printer
from ._messages import Messages
@ -111,7 +112,7 @@ def train(
msg.fail(Messages.M051, dev_path, exits=1)
if meta_path is not None and not meta_path.exists():
msg.fail(Messages.M020, meta_path, exits=1)
meta = util.read_json(meta_path) if meta_path else {}
meta = srsly.read_json(meta_path) if meta_path else {}
if not isinstance(meta, dict):
msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1)
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
@ -226,7 +227,7 @@ def train(
end_time = timer()
cpu_wps = nwords / (end_time - start_time)
acc_loc = output_path / ("model%d" % i) / "accuracy.json"
util.write_json(acc_loc, scorer.scores)
srsly.write_json(acc_loc, scorer.scores)
# Update model meta.json
meta["lang"] = nlp.lang
@ -242,7 +243,7 @@ def train(
meta.setdefault("name", "model%d" % i)
meta.setdefault("version", version)
meta_loc = output_path / ("model%d" % i) / "meta.json"
util.write_json(meta_loc, meta)
srsly.write_json(meta_loc, meta)
@ -293,17 +294,17 @@ def _collate_best_model(meta, output_path, components):
for component, best_component_src in bests.items():
shutil.rmtree(best_dest / component)
shutil.copytree(best_component_src / component, best_dest / component)
accs = util.read_json(best_component_src / "accuracy.json")
accs = srsly.read_json(best_component_src / "accuracy.json")
for metric in _get_metrics(component):
meta["accuracy"][metric] = accs[metric]
util.write_json(best_dest / "meta.json", meta)
srsly.write_json(best_dest / "meta.json", meta)
def _find_best(experiment_dir, component):
accuracies = []
for epoch_model in experiment_dir.iterdir():
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
accs = util.read_json(epoch_model / "accuracy.json")
accs = srsly.read_json(epoch_model / "accuracy.json")
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
accuracies.append((scores, epoch_model))
if accuracies:
@ -9,7 +9,7 @@ import tqdm
from pathlib import Path
import re
import sys
import json
import srsly
import spacy
import spacy.util
@ -30,7 +30,6 @@ Fused_inside = None
import itertools
import random
import numpy.random
import cytoolz
from . import conll17_ud_eval
@ -44,7 +43,7 @@ from ...lang import ru
# Data reading #
space_re = re.compile("\s+")
space_re = re.compile(r"\s+")
def split_text(text):
@ -332,8 +331,7 @@ def main(test_data_dir, experiment_dir, corpus):
/ corpus
/ "{section}-accuracy.json".format(section=section)
with open(acc_path, "w") as file_:
file_.write(json.dumps(accuracy, indent=2))
srsly.write_json(acc_path, accuracy)
if __name__ == "__main__":
@ -25,7 +25,6 @@ from timeit import default_timer as timer
import itertools
import random
import numpy.random
import cytoolz
from . import conll17_ud_eval
@ -5,11 +5,12 @@ import pkg_resources
from pathlib import Path
import sys
import requests
import srsly
from wasabi import Printer
from ._messages import Messages
from ..compat import path2str
from ..util import get_data_path, read_json
from ..util import get_data_path
from .. import about
@ -84,7 +85,7 @@ def get_model_links(compat):
meta_path = Path(model) / "meta.json"
if not meta_path.exists():
meta = read_json(meta_path)
meta = srsly.read_json(meta_path)
link = model.parts[-1]
name = meta["lang"] + "_" + meta["name"]
links[link] = {
@ -3,7 +3,6 @@ from __future__ import unicode_literals
import os
import sys
import ujson
import itertools
from thinc.neural.util import copy_array
@ -54,9 +53,6 @@ if is_python2:
unicode_ = unicode # noqa: F821
basestring_ = basestring # noqa: F821
input_ = raw_input # noqa: F821
json_dumps = lambda data, indent=2: ujson.dumps(
data, indent=indent, escape_forward_slashes=False
path2str = lambda path: str(path).decode("utf8")
elif is_python3:
@ -64,9 +60,6 @@ elif is_python3:
unicode_ = str
basestring_ = str
input_ = input
json_dumps = lambda data, indent=2: ujson.dumps(
data, indent=indent, escape_forward_slashes=False
path2str = lambda path: str(path)
@ -4,16 +4,11 @@ from __future__ import unicode_literals, print_function
import re
import random
import cytoolz
import itertools
import numpy
import tempfile
import shutil
from pathlib import Path
import msgpack
import json
import ujson
import srsly
from . import _align
from .syntax import nonproj
@ -21,7 +16,6 @@ from .tokens import Doc
from .errors import Errors
from . import util
from .util import minibatch, itershuffle
from .compat import json_dumps
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
@ -123,12 +117,11 @@ class GoldCorpus(object):
n = 0
for i, doc_tuple in enumerate(doc_tuples):
with open(directory / '{}.msg'.format(i), 'wb') as file_:
msgpack.dump([doc_tuple], file_, use_bin_type=True)
srsly.write_msgpack(directory / '{}.msg'.format(i), [doc_tuple])
n += len(doc_tuple[1])
if limit and n >= limit:
def walk_corpus(path):
path = util.ensure_path(path)
@ -157,8 +150,7 @@ class GoldCorpus(object):
if loc.parts[-1].endswith('json'):
gold_tuples = read_json_file(loc)
elif loc.parts[-1].endswith('msg'):
with loc.open('rb') as file_:
gold_tuples = msgpack.load(file_, raw=False)
gold_tuples = srsly.read_msgpack(loc)
msg = "Cannot read from file: %s. Supported formats: .json, .msg"
raise ValueError(msg % loc)
@ -378,7 +370,7 @@ def _json_iterate(loc):
if square_depth == 1 and curly_depth == 0:
py_str = py_raw[start : i+1].decode('utf8')
yield json.loads(py_str)
yield srsly.json_loads(py_str)
except Exception:
@ -2,7 +2,6 @@
from __future__ import absolute_import, unicode_literals
import random
import ujson
import itertools
import weakref
import functools
@ -10,6 +9,7 @@ from collections import OrderedDict
from contextlib import contextmanager
from copy import copy
from thinc.neural import Model
import srsly
from .tokenizer import Tokenizer
from .vocab import Vocab
@ -18,7 +18,7 @@ from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
from .pipeline import EntityRuler
from .compat import json_dumps, izip, basestring_
from .compat import izip, basestring_
from .gold import GoldParse
from .scorer import Scorer
from ._ml import link_vectors_to_models, create_default_optimizer
@ -640,7 +640,7 @@ class Language(object):
serializers = OrderedDict(
("tokenizer", lambda p: self.tokenizer.to_disk(p, vocab=False)),
("meta.json", lambda p: p.open("w").write(json_dumps(self.meta))),
("meta.json", lambda p: p.open("w").write(srsly.json_dumps(self.meta))),
for name, proc in self.pipeline:
@ -671,7 +671,7 @@ class Language(object):
path = util.ensure_path(path)
deserializers = OrderedDict(
("meta.json", lambda p: self.meta.update(util.read_json(p))),
("meta.json", lambda p: self.meta.update(srsly.read_json(p))),
lambda p: (
@ -705,7 +705,7 @@ class Language(object):
("vocab", lambda: self.vocab.to_bytes()),
("tokenizer", lambda: self.tokenizer.to_bytes(vocab=False)),
("meta", lambda: json_dumps(self.meta)),
("meta", lambda: srsly.json_dumps(self.meta)),
for i, (name, proc) in enumerate(self.pipeline):
@ -725,7 +725,7 @@ class Language(object):
deserializers = OrderedDict(
("meta", lambda b: self.meta.update(ujson.loads(b))),
("meta", lambda b: self.meta.update(srsly.json_loads(b))),
lambda b: (
@ -5,12 +5,8 @@ from __future__ import unicode_literals
import numpy
cimport numpy as np
import cytoolz
from collections import OrderedDict, defaultdict
import ujson
from .util import msgpack
from .util import msgpack_numpy
import srsly
from thinc.api import chain
from thinc.v2v import Affine, Maxout, Softmax
@ -27,7 +23,6 @@ from .syntax.arc_eager cimport ArcEager
from .morphology cimport Morphology
from .vocab cimport Vocab
from .syntax import nonproj
from .compat import json_dumps
from .matcher import Matcher
from .matcher import Matcher, PhraseMatcher
@ -38,7 +33,7 @@ from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
from ._ml import link_vectors_to_models, zero_init, flatten
from ._ml import create_default_optimizer
from .errors import Errors, TempErrors
from .compat import json_dumps, basestring_
from .compat import basestring_
from . import util
@ -235,7 +230,7 @@ class EntityRuler(object):
**kwargs: Other config parameters, mostly for consistency.
RETURNS (EntityRuler): The loaded entity ruler.
patterns = msgpack.loads(patterns_bytes, raw=False)
patterns = srsly.msgpack_loads(patterns_bytes)
return self
@ -244,7 +239,7 @@ class EntityRuler(object):
RETURNS (bytes): The serialized patterns.
return msgpack.dumps(self.patterns, use_bin_type=True)
return srsly.msgpack_dumps(self.patterns)
def from_disk(self, path, **kwargs):
"""Load the entity ruler from a file. Expects a file containing
@ -256,7 +251,7 @@ class EntityRuler(object):
path = util.ensure_path(path)
path = path.with_suffix('.jsonl')
patterns = util.read_jsonl(path)
patterns = srsly.read_jsonl(path)
return self
@ -270,8 +265,7 @@ class EntityRuler(object):
path = util.ensure_path(path)
path = path.with_suffix('.jsonl')
data = [json_dumps(line, indent=0) for line in self.patterns]
srsly.write_jsonl(path, self.patterns)
class Pipe(object):
@ -307,7 +301,7 @@ class Pipe(object):
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
for docs in cytoolz.partition_all(batch_size, stream):
for docs in util.minibatch(stream, size=batch_size):
docs = list(docs)
scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensor=tensors)
@ -368,7 +362,7 @@ class Pipe(object):
def to_bytes(self, **exclude):
"""Serialize the pipe to a bytestring."""
serialize = OrderedDict()
serialize['cfg'] = lambda: json_dumps(self.cfg)
serialize['cfg'] = lambda: srsly.json_dumps(self.cfg)
if self.model in (True, False, None):
serialize['model'] = lambda: self.model
@ -387,7 +381,7 @@ class Pipe(object):
deserialize = OrderedDict((
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
('vocab', lambda b: self.vocab.from_bytes(b)),
('model', load_model),
@ -397,7 +391,7 @@ class Pipe(object):
def to_disk(self, path, **exclude):
"""Serialize the pipe to disk."""
serialize = OrderedDict()
serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg))
serialize['cfg'] = lambda p: srsly.write_json(p, self.cfg)
serialize['vocab'] = lambda p: self.vocab.to_disk(p)
if self.model not in (None, True, False):
serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
@ -424,8 +418,7 @@ class Pipe(object):
def _load_cfg(path):
if path.exists():
with path.open() as file_:
return ujson.load(file_)
return srsly.read_json(path)
return {}
@ -485,7 +478,7 @@ class Tensorizer(Pipe):
n_threads (int): Number of threads.
YIELDS (iterator): A sequence of `Doc` objects, in order of input.
for docs in cytoolz.partition_all(batch_size, stream):
for docs in util.minibatch(stream, size=batch_size):
docs = list(docs)
tensors = self.predict(docs)
self.set_annotations(docs, tensors)
@ -594,7 +587,7 @@ class Tagger(Pipe):
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
for docs in util.minibatch(stream, size=batch_size):
docs = list(docs)
tag_ids, tokvecs = self.predict(docs)
self.set_annotations(docs, tag_ids, tensors=tokvecs)
@ -745,10 +738,9 @@ class Tagger(Pipe):
serialize['model'] = self.model.to_bytes
serialize['vocab'] = self.vocab.to_bytes
serialize['cfg'] = lambda: ujson.dumps(self.cfg)
serialize['cfg'] = lambda: srsly.json_dumps(self.cfg)
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
serialize['tag_map'] = lambda: msgpack.dumps(
tag_map, use_bin_type=True)
serialize['tag_map'] = lambda: srsly.msgpack_dumps(tag_map)
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
@ -766,7 +758,7 @@ class Tagger(Pipe):
def load_tag_map(b):
tag_map = msgpack.loads(b, raw=False)
tag_map = srsly.msgpack_loads(b)
self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map,
@ -775,7 +767,7 @@ class Tagger(Pipe):
deserialize = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('tag_map', load_tag_map),
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
('model', lambda b: load_model(b)),
util.from_bytes(bytes_data, deserialize, exclude)
@ -785,10 +777,9 @@ class Tagger(Pipe):
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
serialize = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)),
('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
tag_map, use_bin_type=True))),
('tag_map', lambda p: srsly.write_msgpack(p, tag_map)),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
('cfg', lambda p: srsly.write_json(p, self.cfg))
util.to_disk(path, serialize, exclude)
@ -803,8 +794,7 @@ class Tagger(Pipe):
def load_tag_map(p):
with p.open('rb') as file_:
tag_map = msgpack.loads(file_.read(), raw=False)
tag_map = srsly.read_msgpack(p)
self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map,
@ -1082,7 +1072,7 @@ class TextCategorizer(Pipe):
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
for docs in util.minibatch(stream, size=batch_size):
docs = list(docs)
scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensors=tensors)
@ -7,12 +7,11 @@ from libc.string cimport memcpy
from libcpp.set cimport set
from libc.stdint cimport uint32_t
from murmurhash.mrmr cimport hash64, hash32
import ujson
import srsly
from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT
from .typedefs cimport hash_t
from .compat import json_dumps
from .errors import Errors
from . import util
@ -197,8 +196,7 @@ cdef class StringStore:
path = util.ensure_path(path)
strings = list(self)
with path.open('w') as file_:
srsly.write_json(path, strings)
def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
@ -209,8 +207,7 @@ cdef class StringStore:
RETURNS (StringStore): The modified `StringStore` object.
path = util.ensure_path(path)
with path.open('r') as file_:
strings = ujson.load(file_)
strings = srsly.read_json(path)
prev = list(self)
for word in prev:
@ -223,7 +220,7 @@ cdef class StringStore:
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `StringStore` object.
return json_dumps(list(self))
return srsly.json_dumps(list(self))
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.
@ -232,7 +229,7 @@ cdef class StringStore:
**exclude: Named attributes to prevent from being loaded.
RETURNS (StringStore): The `StringStore` object.
strings = ujson.loads(bytes_data)
strings = srsly.json_loads(bytes_data)
prev = list(self)
for word in prev:
@ -5,11 +5,8 @@
from __future__ import unicode_literals, print_function
from collections import OrderedDict
import ujson
import json
import numpy
cimport cython.parallel
import cytoolz
import numpy.random
cimport numpy as np
from libc.math cimport exp
@ -29,7 +26,7 @@ cimport blis.cy
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
from .._ml import link_vectors_to_models, create_default_optimizer
from ..compat import json_dumps, copy_array
from ..compat import copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
from ..errors import Errors, TempErrors
@ -119,7 +116,7 @@ cdef void predict_states(ActivationsC* A, StateC** states,
W.hidden_bias, 1., n.classes)
cdef void sum_state_features(float* output,
const float* cached, const int* token_ids, int B, int F, int O) nogil:
cdef int idx, b, f, i
@ -165,7 +162,7 @@ cdef void cpu_log_loss(float* d_scores,
d_scores[i] = exp(scores[i]-max_) / Z
cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
const int* is_valid, int n) nogil:
# Find minimum cost
@ -218,15 +215,15 @@ class ParserModel(Model):
def begin_training(self, X, y=None):
self.lower.begin_training(X, y=y)
def tok2vec(self):
return self._layers[0]
def lower(self):
return self._layers[1]
def upper(self):
return self._layers[2]
@ -405,4 +402,3 @@ cdef class precompute_hiddens:
return self.ops.backprop_maxout(d_best, mask, self.nP)
return state_vector, backprop_nonlinearity
@ -5,13 +5,11 @@
from __future__ import unicode_literals, print_function
from collections import OrderedDict
import ujson
import json
import numpy
cimport cython.parallel
import cytoolz
import numpy.random
cimport numpy as np
from itertools import islice
from cpython.ref cimport PyObject, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from libc.math cimport exp
@ -27,6 +25,7 @@ from thinc.misc import LayerNorm
from thinc.neural.ops import CupyOps
from thinc.neural.util import get_array_module
from thinc.linalg cimport Vec, VecVec
import srsly
from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
@ -34,7 +33,7 @@ from ._parser_model cimport get_c_weights, get_c_sizes
from ._parser_model import ParserModel
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
from .._ml import link_vectors_to_models, create_default_optimizer
from ..compat import json_dumps, copy_array
from ..compat import copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
from ..errors import Errors, TempErrors
@ -214,10 +213,10 @@ cdef class Parser:
beam_width = self.cfg.get('beam_width', 1)
beam_density = self.cfg.get('beam_density', 0.)
cdef Doc doc
for batch in cytoolz.partition_all(batch_size, docs):
for batch in util.minibatch(docs, size=batch_size):
batch_in_order = list(batch)
by_length = sorted(batch_in_order, key=lambda doc: len(doc))
for subbatch in cytoolz.partition_all(8, by_length):
for subbatch in util.minibatch(by_length, size=batch_size//4):
subbatch = list(subbatch)
parse_states = self.predict(subbatch, beam_width=beam_width,
@ -517,7 +516,7 @@ cdef class Parser:
sgd = self.create_optimizer()
doc_sample = []
gold_sample = []
for raw_text, annots_brackets in cytoolz.take(1000, get_gold_tuples()):
for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots
doc_sample.append(Doc(self.vocab, words=words))
@ -539,7 +538,7 @@ cdef class Parser:
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
'vocab': lambda p: self.vocab.to_disk(p),
'moves': lambda p: self.moves.to_disk(p, strings=False),
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
'cfg': lambda p: srsly.write_json(p, self.cfg)
util.to_disk(path, serializers, exclude)
@ -547,7 +546,7 @@ cdef class Parser:
deserializers = {
'vocab': lambda p: self.vocab.from_disk(p),
'moves': lambda p: self.moves.from_disk(p, strings=False),
'cfg': lambda p: self.cfg.update(util.read_json(p)),
'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
'model': lambda p: None
util.from_disk(path, deserializers, exclude)
@ -568,7 +567,7 @@ cdef class Parser:
('model', lambda: (self.model.to_bytes() if self.model is not True else True)),
('vocab', lambda: self.vocab.to_bytes()),
('moves', lambda: self.moves.to_bytes(strings=False)),
('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True))
('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True))
return util.to_bytes(serializers, exclude)
@ -576,7 +575,7 @@ cdef class Parser:
deserializers = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
('cfg', lambda b: self.cfg.update(json.loads(b))),
('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
('model', lambda b: None)
msg = util.from_bytes(bytes_data, deserializers, exclude)
@ -7,14 +7,13 @@ from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam
from collections import OrderedDict, Counter
import ujson
import srsly
from . cimport _beam_utils
from ..tokens.doc cimport Doc
from ..structs cimport TokenC
from .stateclass cimport StateClass
from ..typedefs cimport attr_t
from ..compat import json_dumps
from ..errors import Errors
from .. import util
@ -153,13 +152,13 @@ cdef class TransitionSystem:
# Make sure we take a copy here, and that we get a Counter
self.labels[action] = Counter()
# Have to be careful here: Sorting must be stable, or our model
# won't be read back in correctly.
# won't be read back in correctly.
sorted_labels = [(f, L) for L, f in label_freqs.items()]
for freq, label_str in sorted_labels:
self.add_action(int(action), label_str)
self.labels[action][label_str] = freq
self.labels[action][label_str] = freq
def add_action(self, int action, label_name):
cdef attr_t label_id
@ -204,7 +203,7 @@ cdef class TransitionSystem:
def to_bytes(self, **exclude):
transitions = []
serializers = {
'moves': lambda: json_dumps(self.labels),
'moves': lambda: srsly.json_dumps(self.labels),
'strings': lambda: self.strings.to_bytes()
return util.to_bytes(serializers, exclude)
@ -212,7 +211,7 @@ cdef class TransitionSystem:
def from_bytes(self, bytes_data, **exclude):
labels = {}
deserializers = {
'moves': lambda b: labels.update(ujson.loads(b)),
'moves': lambda b: labels.update(srsly.json_loads(b)),
'strings': lambda b: self.strings.from_bytes(b)
msg = util.from_bytes(bytes_data, deserializers, exclude)
@ -2,8 +2,8 @@
from __future__ import unicode_literals
import pytest
import dill as pickle
import numpy
import srsly
from spacy.strings import StringStore
from spacy.vocab import Vocab
from spacy.attrs import NORM
@ -14,8 +14,8 @@ def test_pickle_string_store(text1, text2):
stringstore = StringStore()
store1 = stringstore[text1]
store2 = stringstore[text2]
data = pickle.dumps(stringstore, protocol=-1)
unpickled = pickle.loads(data)
data = srsly.pickle_dumps(stringstore, protocol=-1)
unpickled = srsly.pickle_loads(data)
assert unpickled[text1] == store1
assert unpickled[text2] == store2
assert len(stringstore) == len(unpickled)
@ -29,8 +29,8 @@ def test_pickle_vocab(text1, text2):
lex2 = vocab[text2]
assert lex1.norm_ == text1[:-1]
assert lex2.norm_ == text2[:-1]
data = pickle.dumps(vocab)
unpickled = pickle.loads(data)
data = srsly.pickle_dumps(vocab)
unpickled = srsly.pickle_loads(data)
assert unpickled[text1].orth == lex1.orth
assert unpickled[text2].orth == lex2.orth
assert unpickled[text1].norm == lex1.norm
@ -5,7 +5,7 @@ import numpy
import tempfile
import shutil
import contextlib
import msgpack
import srsly
from pathlib import Path
from spacy.tokens import Doc, Span
from spacy.attrs import POS, HEAD, DEP
@ -100,8 +100,8 @@ def assert_docs_equal(doc1, doc2):
def assert_packed_msg_equal(b1, b2):
"""Assert that two packed msgpack messages are equal."""
msg1 = msgpack.loads(b1, encoding="utf8")
msg2 = msgpack.loads(b2, encoding="utf8")
msg1 = srsly.msgpack_loads(b1)
msg2 = srsly.msgpack_loads(b2)
assert sorted(msg1.keys()) == sorted(msg2.keys())
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
assert k1 == k2
@ -1,8 +1,8 @@
from __future__ import unicode_literals
import numpy
import msgpack
import gzip
import srsly
from thinc.neural.ops import NumpyOps
from ..compat import copy_reg
@ -74,11 +74,11 @@ class Binder(object):
"lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
"strings": list(self.strings),
return gzip.compress(msgpack.dumps(msg))
return gzip.compress(srsly.msgpack_dumps(msg))
def from_bytes(self, string):
"""Deserialize the binder's annotations from a byte string."""
msg = msgpack.loads(gzip.decompress(string))
msg = srsly.msgpack_loads(gzip.decompress(string))
self.attrs = msg["attrs"]
self.strings = set(msg["strings"])
lengths = numpy.fromstring(msg["lengths"], dtype="int32")
@ -9,9 +9,9 @@ cimport numpy as np
import numpy
import numpy.linalg
import struct
import dill
import msgpack
import srsly
from thinc.neural.util import get_array_module, copy_array
import srsly
from libc.string cimport memcpy, memset
from libc.math cimport sqrt
@ -28,7 +28,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, SENT_START
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..util import normalize_slice, is_json_serializable
from ..util import normalize_slice
from ..compat import is_config, copy_reg, pickle, basestring_
from ..errors import deprecation_warning, models_warning, user_warning
from ..errors import Errors, Warnings
@ -807,8 +807,8 @@ cdef class Doc:
if 'user_data' not in exclude and self.user_data:
user_data_keys, user_data_values = list(zip(*self.user_data.items()))
serializers['user_data_keys'] = lambda: msgpack.dumps(user_data_keys)
serializers['user_data_values'] = lambda: msgpack.dumps(user_data_values)
serializers['user_data_keys'] = lambda: srsly.msgpack_dumps(user_data_keys)
serializers['user_data_values'] = lambda: srsly.msgpack_dumps(user_data_values)
return util.to_bytes(serializers, exclude)
@ -836,9 +836,8 @@ cdef class Doc:
# keys, we must have tuples. In values we just have to hope
# users don't mind getting a list instead of a tuple.
if 'user_data' not in exclude and 'user_data_keys' in msg:
user_data_keys = msgpack.loads(msg['user_data_keys'],
use_list=False, raw=False)
user_data_values = msgpack.loads(msg['user_data_values'], raw=False)
user_data_keys = srsly.msgpack_loads(msg['user_data_keys'], use_list=False)
user_data_values = srsly.msgpack_loads(msg['user_data_values'])
for key, value in zip(user_data_keys, user_data_values):
self.user_data[key] = value
@ -996,7 +995,7 @@ cdef class Doc:
if not self.has_extension(attr):
raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
value = self._.get(attr)
if not is_json_serializable(value):
if not srsly.is_json_serializable(value):
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
data['_'][attr] = value
return data
@ -1062,11 +1061,11 @@ def pickle_doc(doc):
bytes_data = doc.to_bytes(vocab=False, user_data=False)
hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,
return (unpickle_doc, (doc.vocab, dill.dumps(hooks_and_data), bytes_data))
return (unpickle_doc, (doc.vocab, srsly.pickle_dumps(hooks_and_data), bytes_data))
def unpickle_doc(vocab, hooks_and_data, bytes_data):
user_data, doc_hooks, span_hooks, token_hooks = dill.loads(hooks_and_data)
user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data)
doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data,
@ -2,7 +2,6 @@
from __future__ import unicode_literals, print_function
import os
import ujson
import pkg_resources
import importlib
import regex as re
@ -12,21 +11,15 @@ from collections import OrderedDict
from thinc.neural._classes.model import Model
from thinc.neural.ops import NumpyOps
import functools
import cytoolz
import itertools
import numpy.random
import srsly
from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, unicode_
from .compat import import_file, json_dumps
from .compat import import_file
from .errors import Errors
# Import these directly from Thinc, so that we're sure we always have the
# same version.
from thinc.neural._classes.model import msgpack # noqa: F401
from thinc.neural._classes.model import msgpack_numpy # noqa: F401
_data_path = Path(__file__).parent / "data"
@ -185,7 +178,7 @@ def get_model_meta(path):
meta_path = model_path / "meta.json"
if not meta_path.is_file():
raise IOError(Errors.E053.format(path=meta_path))
meta = read_json(meta_path)
meta = srsly.read_json(meta_path)
for setting in ["lang", "name", "version"]:
if setting not in meta or not meta[setting]:
raise ValueError(Errors.E054.format(setting=setting))
@ -409,7 +402,7 @@ def minibatch(items, size=8):
items = iter(items)
while True:
batch_size = next(size_)
batch = list(cytoolz.take(int(batch_size), items))
batch = list(itertools.islice(items, int(batch_size)))
if len(batch) == 0:
yield list(batch)
@ -529,74 +522,16 @@ def itershuffle(iterable, bufsize=1000):
raise StopIteration
def read_json(location):
"""Open and load JSON from file.
location (Path): Path to JSON file.
RETURNS (dict): Loaded JSON content.
location = ensure_path(location)
with location.open("r", encoding="utf8") as f:
return ujson.load(f)
def write_json(file_path, contents):
"""Create a .json file and dump contents.
file_path (unicode / Path): The path to the output file.
contents: The JSON-serializable contents to output.
with Path(file_path).open("w", encoding="utf8") as f:
def read_jsonl(file_path):
"""Read a .jsonl file and yield its contents line by line.
file_path (unicode / Path): The file path.
YIELDS: The loaded JSON contents of each line.
with Path(file_path).open("r", encoding="utf8") as f:
for line in f:
try: # hack to handle broken jsonl
yield ujson.loads(line.strip())
except ValueError:
def write_jsonl(file_path, lines):
"""Create a .jsonl file and dump contents.
file_path (unicode / Path): The path to the output file.
lines (list): The JSON-serializable contents of each line.
data = [json_dumps(line) for line in lines]
with Path(file_path).open("w", encoding="utf-8") as f:
def is_json_serializable(obj):
"""Check if a Python object is JSON-serializable."""
if hasattr(obj, "__call__"):
# Check this separately here to prevent infinite recursions
return False
return True
except TypeError:
return False
def to_bytes(getters, exclude):
serialized = OrderedDict()
for key, getter in getters.items():
if key not in exclude:
serialized[key] = getter()
return msgpack.dumps(serialized, use_bin_type=True)
return srsly.msgpack_dumps(serialized)
def from_bytes(bytes_data, setters, exclude):
msg = msgpack.loads(bytes_data, raw=False)
msg = srsly.msgpack_loads(bytes_data)
for key, setter in setters.items():
if key not in exclude and key in msg:
@ -4,9 +4,7 @@ from __future__ import unicode_literals
import functools
import numpy
from collections import OrderedDict
from .util import msgpack
from .util import msgpack_numpy
import srsly
cimport numpy as np
from thinc.neural.util import get_array_module
@ -353,7 +351,7 @@ cdef class Vectors:
save_array = lambda arr, file_: xp.save(file_, arr)
serializers = OrderedDict((
('vectors', lambda p: save_array(self.data, p.open('wb'))),
('key2row', lambda p: msgpack.dump(self.key2row, p.open('wb')))
('key2row', lambda p: srsly.write_msgpack(p, self.key2row))
return util.to_disk(path, serializers, exclude)
@ -366,8 +364,7 @@ cdef class Vectors:
def load_key2row(path):
if path.exists():
with path.open('rb') as file_:
self.key2row = msgpack.load(file_)
self.key2row = srsly.read_msgpack(path)
for key, row in self.key2row.items():
if self._unset.count(row):
@ -401,9 +398,9 @@ cdef class Vectors:
if hasattr(self.data, 'to_bytes'):
return self.data.to_bytes()
return msgpack.dumps(self.data)
return srsly.msgpack_dumps(self.data)
serializers = OrderedDict((
('key2row', lambda: msgpack.dumps(self.key2row)),
('key2row', lambda: srsly.msgpack_dumps(self.key2row)),
('vectors', serialize_weights)
return util.to_bytes(serializers, exclude)
@ -419,10 +416,10 @@ cdef class Vectors:
if hasattr(self.data, 'from_bytes'):
self.data = msgpack.loads(b)
self.data = srsly.msgpack_loads(b)
deserializers = OrderedDict((
('key2row', lambda b: self.key2row.update(msgpack.loads(b))),
('key2row', lambda b: self.key2row.update(srsly.msgpack_loads(b))),
('vectors', deserialize_weights)
util.from_bytes(data, deserializers, exclude)
@ -3,7 +3,7 @@
from __future__ import unicode_literals
import numpy
import dill
import srsly
from collections import OrderedDict
from thinc.neural.util import get_array_module
@ -513,7 +513,7 @@ def pickle_vocab(vocab):
morph = vocab.morphology
length = vocab.length
data_dir = vocab.data_dir
lex_attr_getters = dill.dumps(vocab.lex_attr_getters)
lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
lexemes_data = vocab.lexemes_to_bytes()
return (unpickle_vocab,
(sstore, vectors, morph, data_dir, lex_attr_getters, lexemes_data, length))
@ -527,7 +527,7 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir,
vocab.strings = sstore
vocab.morphology = morphology
vocab.data_dir = data_dir
vocab.lex_attr_getters = dill.loads(lex_attr_getters)
vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
vocab.length = length
return vocab
@ -9,10 +9,9 @@ p
| underscore, e.g. #[code unicode_].
from spacy.compat import unicode_, json_dumps
from spacy.compat import unicode_
compatible_unicode = unicode_('hello world')
compatible_json = json_dumps({'key': 'value'})
+table(["Name", "Python 2", "Python 3"])
@ -35,11 +34,6 @@ p
+cell #[code raw_input]
+cell #[code input]
+cell #[code compat.json_dumps]
+cell #[code ujson.dumps] with #[code .decode('utf8')]
+cell #[code ujson.dumps]
+cell #[code compat.path2str]
+cell #[code str(path)] with #[code .decode('utf8')]
Reference in New Issue
Block a user