Auto-format examples

This commit is contained in:
Ines Montani 2018-12-02 04:26:26 +01:00
parent 6f2d3c863a
commit 45798cc53e
22 changed files with 708 additions and 506 deletions

View File

@ -1,5 +1,12 @@
""" """
This example shows how to use an LSTM sentiment classification model trained using Keras in spaCy. spaCy splits the document into sentences, and each sentence is classified using the LSTM. The scores for the sentences are then aggregated to give the document score. This kind of hierarchical model is quite difficult in "pure" Keras or Tensorflow, but it's very effective. The Keras example on this dataset performs quite poorly, because it cuts off the documents so that they're a fixed size. This hurts review accuracy a lot, because people often summarise their rating in the final sentence This example shows how to use an LSTM sentiment classification model trained
using Keras in spaCy. spaCy splits the document into sentences, and each
sentence is classified using the LSTM. The scores for the sentences are then
aggregated to give the document score. This kind of hierarchical model is quite
difficult in "pure" Keras or Tensorflow, but it's very effective. The Keras
example on this dataset performs quite poorly, because it cuts off the documents
so that they're a fixed size. This hurts review accuracy a lot, because people
often summarise their rating in the final sentence
Prerequisites: Prerequisites:
spacy download en_vectors_web_lg spacy download en_vectors_web_lg
@ -25,9 +32,9 @@ import spacy
class SentimentAnalyser(object): class SentimentAnalyser(object):
@classmethod @classmethod
def load(cls, path, nlp, max_length=100): def load(cls, path, nlp, max_length=100):
with (path / 'config.json').open() as file_: with (path / "config.json").open() as file_:
model = model_from_json(file_.read()) model = model_from_json(file_.read())
with (path / 'model').open('rb') as file_: with (path / "model").open("rb") as file_:
lstm_weights = pickle.load(file_) lstm_weights = pickle.load(file_)
embeddings = get_embeddings(nlp.vocab) embeddings = get_embeddings(nlp.vocab)
model.set_weights([embeddings] + lstm_weights) model.set_weights([embeddings] + lstm_weights)
@ -69,12 +76,12 @@ def get_labelled_sentences(docs, doc_labels):
for sent in doc.sents: for sent in doc.sents:
sentences.append(sent) sentences.append(sent)
labels.append(y) labels.append(y)
return sentences, numpy.asarray(labels, dtype='int32') return sentences, numpy.asarray(labels, dtype="int32")
def get_features(docs, max_length): def get_features(docs, max_length):
docs = list(docs) docs = list(docs)
Xs = numpy.zeros((len(docs), max_length), dtype='int32') Xs = numpy.zeros((len(docs), max_length), dtype="int32")
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
j = 0 j = 0
for token in doc: for token in doc:
@ -89,13 +96,22 @@ def get_features(docs, max_length):
return Xs return Xs
def train(train_texts, train_labels, dev_texts, dev_labels, def train(
lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, train_texts,
nb_epoch=5, by_sentence=True): train_labels,
dev_texts,
dev_labels,
lstm_shape,
lstm_settings,
lstm_optimizer,
batch_size=100,
nb_epoch=5,
by_sentence=True,
):
print("Loading spaCy") print("Loading spaCy")
nlp = spacy.load('en_vectors_web_lg') nlp = spacy.load("en_vectors_web_lg")
nlp.add_pipe(nlp.create_pipe('sentencizer')) nlp.add_pipe(nlp.create_pipe("sentencizer"))
embeddings = get_embeddings(nlp.vocab) embeddings = get_embeddings(nlp.vocab)
model = compile_lstm(embeddings, lstm_shape, lstm_settings) model = compile_lstm(embeddings, lstm_shape, lstm_settings)
@ -106,10 +122,15 @@ def train(train_texts, train_labels, dev_texts, dev_labels,
train_docs, train_labels = get_labelled_sentences(train_docs, train_labels) train_docs, train_labels = get_labelled_sentences(train_docs, train_labels)
dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels) dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels)
train_X = get_features(train_docs, lstm_shape['max_length']) train_X = get_features(train_docs, lstm_shape["max_length"])
dev_X = get_features(dev_docs, lstm_shape['max_length']) dev_X = get_features(dev_docs, lstm_shape["max_length"])
model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels), model.fit(
epochs=nb_epoch, batch_size=batch_size) train_X,
train_labels,
validation_data=(dev_X, dev_labels),
epochs=nb_epoch,
batch_size=batch_size,
)
return model return model
@ -119,19 +140,28 @@ def compile_lstm(embeddings, shape, settings):
Embedding( Embedding(
embeddings.shape[0], embeddings.shape[0],
embeddings.shape[1], embeddings.shape[1],
input_length=shape['max_length'], input_length=shape["max_length"],
trainable=False, trainable=False,
weights=[embeddings], weights=[embeddings],
mask_zero=True mask_zero=True,
) )
) )
model.add(TimeDistributed(Dense(shape['nr_hidden'], use_bias=False))) model.add(TimeDistributed(Dense(shape["nr_hidden"], use_bias=False)))
model.add(Bidirectional(LSTM(shape['nr_hidden'], model.add(
recurrent_dropout=settings['dropout'], Bidirectional(
dropout=settings['dropout']))) LSTM(
model.add(Dense(shape['nr_class'], activation='sigmoid')) shape["nr_hidden"],
model.compile(optimizer=Adam(lr=settings['lr']), loss='binary_crossentropy', recurrent_dropout=settings["dropout"],
metrics=['accuracy']) dropout=settings["dropout"],
)
)
)
model.add(Dense(shape["nr_class"], activation="sigmoid"))
model.compile(
optimizer=Adam(lr=settings["lr"]),
loss="binary_crossentropy",
metrics=["accuracy"],
)
return model return model
@ -140,8 +170,8 @@ def get_embeddings(vocab):
def evaluate(model_dir, texts, labels, max_length=100): def evaluate(model_dir, texts, labels, max_length=100):
nlp = spacy.load('en_vectors_web_lg') nlp = spacy.load("en_vectors_web_lg")
nlp.add_pipe(nlp.create_pipe('sentencizer')) nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(SentimentAnalyser.load(model_dir, nlp, max_length=max_length)) nlp.add_pipe(SentimentAnalyser.load(model_dir, nlp, max_length=max_length))
correct = 0 correct = 0
@ -154,7 +184,7 @@ def evaluate(model_dir, texts, labels, max_length=100):
def read_data(data_dir, limit=0): def read_data(data_dir, limit=0):
examples = [] examples = []
for subdir, label in (('pos', 1), ('neg', 0)): for subdir, label in (("pos", 1), ("neg", 0)):
for filename in (data_dir / subdir).iterdir(): for filename in (data_dir / subdir).iterdir():
with filename.open() as file_: with filename.open() as file_:
text = file_.read() text = file_.read()
@ -176,13 +206,21 @@ def read_data(data_dir, limit=0):
learn_rate=("Learn rate", "option", "e", float), learn_rate=("Learn rate", "option", "e", float),
nb_epoch=("Number of training epochs", "option", "i", int), nb_epoch=("Number of training epochs", "option", "i", int),
batch_size=("Size of minibatches for training LSTM", "option", "b", int), batch_size=("Size of minibatches for training LSTM", "option", "b", int),
nr_examples=("Limit to N examples", "option", "n", int) nr_examples=("Limit to N examples", "option", "n", int),
) )
def main(model_dir=None, train_dir=None, dev_dir=None, def main(
model_dir=None,
train_dir=None,
dev_dir=None,
is_runtime=False, is_runtime=False,
nr_hidden=64, max_length=100, # Shape nr_hidden=64,
dropout=0.5, learn_rate=0.001, # General NN config max_length=100, # Shape
nb_epoch=5, batch_size=256, nr_examples=-1): # Training params dropout=0.5,
learn_rate=0.001, # General NN config
nb_epoch=5,
batch_size=256,
nr_examples=-1,
): # Training params
if model_dir is not None: if model_dir is not None:
model_dir = pathlib.Path(model_dir) model_dir = pathlib.Path(model_dir)
if train_dir is None or dev_dir is None: if train_dir is None or dev_dir is None:
@ -204,20 +242,26 @@ def main(model_dir=None, train_dir=None, dev_dir=None,
dev_texts, dev_labels = zip(*imdb_data[1]) dev_texts, dev_labels = zip(*imdb_data[1])
else: else:
dev_texts, dev_labels = read_data(dev_dir, imdb_data, limit=nr_examples) dev_texts, dev_labels = read_data(dev_dir, imdb_data, limit=nr_examples)
train_labels = numpy.asarray(train_labels, dtype='int32') train_labels = numpy.asarray(train_labels, dtype="int32")
dev_labels = numpy.asarray(dev_labels, dtype='int32') dev_labels = numpy.asarray(dev_labels, dtype="int32")
lstm = train(train_texts, train_labels, dev_texts, dev_labels, lstm = train(
{'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 1}, train_texts,
{'dropout': dropout, 'lr': learn_rate}, train_labels,
dev_texts,
dev_labels,
{"nr_hidden": nr_hidden, "max_length": max_length, "nr_class": 1},
{"dropout": dropout, "lr": learn_rate},
{}, {},
nb_epoch=nb_epoch, batch_size=batch_size) nb_epoch=nb_epoch,
batch_size=batch_size,
)
weights = lstm.get_weights() weights = lstm.get_weights()
if model_dir is not None: if model_dir is not None:
with (model_dir / 'model').open('wb') as file_: with (model_dir / "model").open("wb") as file_:
pickle.dump(weights[1:], file_) pickle.dump(weights[1:], file_)
with (model_dir / 'config.json').open('w') as file_: with (model_dir / "config.json").open("w") as file_:
file_.write(lstm.to_json()) file_.write(lstm.to_json())
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)

View File

@ -15,14 +15,15 @@ import spacy
TEXTS = [ TEXTS = [
'Net income was $9.4 million compared to the prior year of $2.7 million.', "Net income was $9.4 million compared to the prior year of $2.7 million.",
'Revenue exceeded twelve billion dollars, with a loss of $1b.', "Revenue exceeded twelve billion dollars, with a loss of $1b.",
] ]
@plac.annotations( @plac.annotations(
model=("Model to load (needs parser and NER)", "positional", None, str)) model=("Model to load (needs parser and NER)", "positional", None, str)
def main(model='en_core_web_sm'): )
def main(model="en_core_web_sm"):
nlp = spacy.load(model) nlp = spacy.load(model)
print("Loaded model '%s'" % model) print("Loaded model '%s'" % model)
print("Processing %d texts" % len(TEXTS)) print("Processing %d texts" % len(TEXTS))
@ -31,7 +32,7 @@ def main(model='en_core_web_sm'):
doc = nlp(text) doc = nlp(text)
relations = extract_currency_relations(doc) relations = extract_currency_relations(doc)
for r1, r2 in relations: for r1, r2 in relations:
print('{:<10}\t{}\t{}'.format(r1.text, r2.ent_type_, r2.text)) print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text))
def extract_currency_relations(doc): def extract_currency_relations(doc):
@ -41,18 +42,18 @@ def extract_currency_relations(doc):
span.merge() span.merge()
relations = [] relations = []
for money in filter(lambda w: w.ent_type_ == 'MONEY', doc): for money in filter(lambda w: w.ent_type_ == "MONEY", doc):
if money.dep_ in ('attr', 'dobj'): if money.dep_ in ("attr", "dobj"):
subject = [w for w in money.head.lefts if w.dep_ == 'nsubj'] subject = [w for w in money.head.lefts if w.dep_ == "nsubj"]
if subject: if subject:
subject = subject[0] subject = subject[0]
relations.append((subject, money)) relations.append((subject, money))
elif money.dep_ == 'pobj' and money.head.dep_ == 'prep': elif money.dep_ == "pobj" and money.head.dep_ == "prep":
relations.append((money.head.head, money)) relations.append((money.head.head, money))
return relations return relations
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)
# Expected output: # Expected output:

View File

@ -24,37 +24,39 @@ import plac
import spacy import spacy
@plac.annotations( @plac.annotations(model=("Model to load", "positional", None, str))
model=("Model to load", "positional", None, str)) def main(model="en_core_web_sm"):
def main(model='en_core_web_sm'):
nlp = spacy.load(model) nlp = spacy.load(model)
print("Loaded model '%s'" % model) print("Loaded model '%s'" % model)
doc = nlp("displaCy uses CSS and JavaScript to show you how computers " doc = nlp(
"understand language") "displaCy uses CSS and JavaScript to show you how computers "
"understand language"
)
# The easiest way is to find the head of the subtree you want, and then use # The easiest way is to find the head of the subtree you want, and then use
# the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree`
# is the one that does what you're asking for most directly: # is the one that does what you're asking for most directly:
for word in doc: for word in doc:
if word.dep_ in ('xcomp', 'ccomp'): if word.dep_ in ("xcomp", "ccomp"):
print(''.join(w.text_with_ws for w in word.subtree)) print("".join(w.text_with_ws for w in word.subtree))
# It'd probably be better for `word.subtree` to return a `Span` object # It'd probably be better for `word.subtree` to return a `Span` object
# instead of a generator over the tokens. If you want the `Span` you can # instead of a generator over the tokens. If you want the `Span` you can
# get it via the `.right_edge` and `.left_edge` properties. The `Span` # get it via the `.right_edge` and `.left_edge` properties. The `Span`
# object is nice because you can easily get a vector, merge it, etc. # object is nice because you can easily get a vector, merge it, etc.
for word in doc: for word in doc:
if word.dep_ in ('xcomp', 'ccomp'): if word.dep_ in ("xcomp", "ccomp"):
subtree_span = doc[word.left_edge.i : word.right_edge.i + 1] subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
print(subtree_span.text, '|', subtree_span.root.text) print(subtree_span.text, "|", subtree_span.root.text)
# You might also want to select a head, and then select a start and end # You might also want to select a head, and then select a start and end
# position by walking along its children. You could then take the # position by walking along its children. You could then take the
# `.left_edge` and `.right_edge` of those tokens, and use it to calculate # `.left_edge` and `.right_edge` of those tokens, and use it to calculate
# a span. # a span.
if __name__ == '__main__':
if __name__ == "__main__":
plac.call(main) plac.call(main)
# Expected output: # Expected output:

View File

@ -19,39 +19,40 @@ from pathlib import Path
@plac.annotations( @plac.annotations(
output_dir=("Output directory for saved HTML", "positional", None, Path)) output_dir=("Output directory for saved HTML", "positional", None, Path)
)
def main(output_dir=None): def main(output_dir=None):
nlp = English() # start off with blank English class nlp = English() # start off with blank English class
Doc.set_extension('overlap', method=overlap_tokens) Doc.set_extension("overlap", method=overlap_tokens)
doc1 = nlp(u"Peach emoji is where it has always been.") doc1 = nlp("Peach emoji is where it has always been.")
doc2 = nlp(u"Peach is the superior emoji.") doc2 = nlp("Peach is the superior emoji.")
print("Text 1:", doc1.text) print("Text 1:", doc1.text)
print("Text 2:", doc2.text) print("Text 2:", doc2.text)
print("Overlapping tokens:", doc1._.overlap(doc2)) print("Overlapping tokens:", doc1._.overlap(doc2))
Doc.set_extension('to_html', method=to_html) Doc.set_extension("to_html", method=to_html)
doc = nlp(u"This is a sentence about Apple.") doc = nlp("This is a sentence about Apple.")
# add entity manually for demo purposes, to make it work without a model # add entity manually for demo purposes, to make it work without a model
doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])] doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings["ORG"])]
print("Text:", doc.text) print("Text:", doc.text)
doc._.to_html(output=output_dir, style='ent') doc._.to_html(output=output_dir, style="ent")
def to_html(doc, output='/tmp', style='dep'): def to_html(doc, output="/tmp", style="dep"):
"""Doc method extension for saving the current state as a displaCy """Doc method extension for saving the current state as a displaCy
visualization. visualization.
""" """
# generate filename from first six non-punct tokens # generate filename from first six non-punct tokens
file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html' file_name = "-".join([w.text for w in doc[:6] if not w.is_punct]) + ".html"
html = displacy.render(doc, style=style, page=True) # render markup html = displacy.render(doc, style=style, page=True) # render markup
if output is not None: if output is not None:
output_path = Path(output) output_path = Path(output)
if not output_path.exists(): if not output_path.exists():
output_path.mkdir() output_path.mkdir()
output_file = Path(output) / file_name output_file = Path(output) / file_name
output_file.open('w', encoding='utf-8').write(html) # save to file output_file.open("w", encoding="utf-8").write(html) # save to file
print('Saved HTML to {}'.format(output_file)) print("Saved HTML to {}".format(output_file))
else: else:
print(html) print(html)
@ -67,7 +68,7 @@ def overlap_tokens(doc, other_doc):
return overlap return overlap
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)
# Expected output: # Expected output:

View File

@ -26,14 +26,18 @@ def main():
nlp = English() nlp = English()
rest_countries = RESTCountriesComponent(nlp) # initialise component rest_countries = RESTCountriesComponent(nlp) # initialise component
nlp.add_pipe(rest_countries) # add it to the pipeline nlp.add_pipe(rest_countries) # add it to the pipeline
doc = nlp(u"Some text about Colombia and the Czech Republic") doc = nlp("Some text about Colombia and the Czech Republic")
print('Pipeline', nlp.pipe_names) # pipeline contains component name print("Pipeline", nlp.pipe_names) # pipeline contains component name
print('Doc has countries', doc._.has_country) # Doc contains countries print("Doc has countries", doc._.has_country) # Doc contains countries
for token in doc: for token in doc:
if token._.is_country: if token._.is_country:
print(token.text, token._.country_capital, token._.country_latlng, print(
token._.country_flag) # country data token.text,
print('Entities', [(e.text, e.label_) for e in doc.ents]) # entities token._.country_capital,
token._.country_latlng,
token._.country_flag,
) # country data
print("Entities", [(e.text, e.label_) for e in doc.ents]) # entities
class RESTCountriesComponent(object): class RESTCountriesComponent(object):
@ -41,42 +45,42 @@ class RESTCountriesComponent(object):
the REST Countries API, merges country names into one token, assigns entity the REST Countries API, merges country names into one token, assigns entity
labels and sets attributes on country tokens. labels and sets attributes on country tokens.
""" """
name = 'rest_countries' # component name, will show up in the pipeline
def __init__(self, nlp, label='GPE'): name = "rest_countries" # component name, will show up in the pipeline
def __init__(self, nlp, label="GPE"):
"""Initialise the pipeline component. The shared nlp instance is used """Initialise the pipeline component. The shared nlp instance is used
to initialise the matcher with the shared vocab, get the label ID and to initialise the matcher with the shared vocab, get the label ID and
generate Doc objects as phrase match patterns. generate Doc objects as phrase match patterns.
""" """
# Make request once on initialisation and store the data # Make request once on initialisation and store the data
r = requests.get('https://restcountries.eu/rest/v2/all') r = requests.get("https://restcountries.eu/rest/v2/all")
r.raise_for_status() # make sure requests raises an error if it fails r.raise_for_status() # make sure requests raises an error if it fails
countries = r.json() countries = r.json()
# Convert API response to dict keyed by country name for easy lookup # Convert API response to dict keyed by country name for easy lookup
# This could also be extended using the alternative and foreign language # This could also be extended using the alternative and foreign language
# names provided by the API # names provided by the API
self.countries = {c['name']: c for c in countries} self.countries = {c["name"]: c for c in countries}
self.label = nlp.vocab.strings[label] # get entity label ID self.label = nlp.vocab.strings[label] # get entity label ID
# Set up the PhraseMatcher with Doc patterns for each country name # Set up the PhraseMatcher with Doc patterns for each country name
patterns = [nlp(c) for c in self.countries.keys()] patterns = [nlp(c) for c in self.countries.keys()]
self.matcher = PhraseMatcher(nlp.vocab) self.matcher = PhraseMatcher(nlp.vocab)
self.matcher.add('COUNTRIES', None, *patterns) self.matcher.add("COUNTRIES", None, *patterns)
# Register attribute on the Token. We'll be overwriting this based on # Register attribute on the Token. We'll be overwriting this based on
# the matches, so we're only setting a default value, not a getter. # the matches, so we're only setting a default value, not a getter.
# If no default value is set, it defaults to None. # If no default value is set, it defaults to None.
Token.set_extension('is_country', default=False) Token.set_extension("is_country", default=False)
Token.set_extension('country_capital', default=False) Token.set_extension("country_capital", default=False)
Token.set_extension('country_latlng', default=False) Token.set_extension("country_latlng", default=False)
Token.set_extension('country_flag', default=False) Token.set_extension("country_flag", default=False)
# Register attributes on Doc and Span via a getter that checks if one of # Register attributes on Doc and Span via a getter that checks if one of
# the contained tokens is set to is_country == True. # the contained tokens is set to is_country == True.
Doc.set_extension('has_country', getter=self.has_country) Doc.set_extension("has_country", getter=self.has_country)
Span.set_extension('has_country', getter=self.has_country) Span.set_extension("has_country", getter=self.has_country)
def __call__(self, doc): def __call__(self, doc):
"""Apply the pipeline component on a Doc object and modify it if matches """Apply the pipeline component on a Doc object and modify it if matches
@ -93,10 +97,10 @@ class RESTCountriesComponent(object):
# Can be extended with other data returned by the API, like # Can be extended with other data returned by the API, like
# currencies, country code, flag, calling code etc. # currencies, country code, flag, calling code etc.
for token in entity: for token in entity:
token._.set('is_country', True) token._.set("is_country", True)
token._.set('country_capital', self.countries[entity.text]['capital']) token._.set("country_capital", self.countries[entity.text]["capital"])
token._.set('country_latlng', self.countries[entity.text]['latlng']) token._.set("country_latlng", self.countries[entity.text]["latlng"])
token._.set('country_flag', self.countries[entity.text]['flag']) token._.set("country_flag", self.countries[entity.text]["flag"])
# Overwrite doc.ents and add entity – be careful not to replace! # Overwrite doc.ents and add entity – be careful not to replace!
doc.ents = list(doc.ents) + [entity] doc.ents = list(doc.ents) + [entity]
for span in spans: for span in spans:
@ -111,10 +115,10 @@ class RESTCountriesComponent(object):
is a country. Since the getter is only called when we access the is a country. Since the getter is only called when we access the
attribute, we can refer to the Token's 'is_country' attribute here, attribute, we can refer to the Token's 'is_country' attribute here,
which is already set in the processing step.""" which is already set in the processing step."""
return any([t._.get('is_country') for t in tokens]) return any([t._.get("is_country") for t in tokens])
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)
# Expected output: # Expected output:

View File

@ -20,23 +20,24 @@ from spacy.tokens import Doc, Span, Token
@plac.annotations( @plac.annotations(
text=("Text to process", "positional", None, str), text=("Text to process", "positional", None, str),
companies=("Names of technology companies", "positional", None, str)) companies=("Names of technology companies", "positional", None, str),
)
def main(text="Alphabet Inc. is the company behind Google.", *companies): def main(text="Alphabet Inc. is the company behind Google.", *companies):
# For simplicity, we start off with only the blank English Language class # For simplicity, we start off with only the blank English Language class
# and no model or pre-defined pipeline loaded. # and no model or pre-defined pipeline loaded.
nlp = English() nlp = English()
if not companies: # set default companies if none are set via args if not companies: # set default companies if none are set via args
companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc. companies = ["Alphabet Inc.", "Google", "Netflix", "Apple"] # etc.
component = TechCompanyRecognizer(nlp, companies) # initialise component component = TechCompanyRecognizer(nlp, companies) # initialise component
nlp.add_pipe(component, last=True) # add last to the pipeline nlp.add_pipe(component, last=True) # add last to the pipeline
doc = nlp(text) doc = nlp(text)
print('Pipeline', nlp.pipe_names) # pipeline contains component name print("Pipeline", nlp.pipe_names) # pipeline contains component name
print('Tokens', [t.text for t in doc]) # company names from the list are merged print("Tokens", [t.text for t in doc]) # company names from the list are merged
print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs print("Doc has_tech_org", doc._.has_tech_org) # Doc contains tech orgs
print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org print("Token 0 is_tech_org", doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org
print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not print("Token 1 is_tech_org", doc[1]._.is_tech_org) # "is" is not
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities print("Entities", [(e.text, e.label_) for e in doc.ents]) # all orgs are entities
class TechCompanyRecognizer(object): class TechCompanyRecognizer(object):
@ -45,9 +46,10 @@ class TechCompanyRecognizer(object):
labelled as ORG and their spans are merged into one token. Additionally, labelled as ORG and their spans are merged into one token. Additionally,
._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token ._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token
respectively.""" respectively."""
name = 'tech_companies' # component name, will show up in the pipeline
def __init__(self, nlp, companies=tuple(), label='ORG'): name = "tech_companies" # component name, will show up in the pipeline
def __init__(self, nlp, companies=tuple(), label="ORG"):
"""Initialise the pipeline component. The shared nlp instance is used """Initialise the pipeline component. The shared nlp instance is used
to initialise the matcher with the shared vocab, get the label ID and to initialise the matcher with the shared vocab, get the label ID and
generate Doc objects as phrase match patterns. generate Doc objects as phrase match patterns.
@ -58,16 +60,16 @@ class TechCompanyRecognizer(object):
# so even if the list of companies is long, it's very efficient # so even if the list of companies is long, it's very efficient
patterns = [nlp(org) for org in companies] patterns = [nlp(org) for org in companies]
self.matcher = PhraseMatcher(nlp.vocab) self.matcher = PhraseMatcher(nlp.vocab)
self.matcher.add('TECH_ORGS', None, *patterns) self.matcher.add("TECH_ORGS", None, *patterns)
# Register attribute on the Token. We'll be overwriting this based on # Register attribute on the Token. We'll be overwriting this based on
# the matches, so we're only setting a default value, not a getter. # the matches, so we're only setting a default value, not a getter.
Token.set_extension('is_tech_org', default=False) Token.set_extension("is_tech_org", default=False)
# Register attributes on Doc and Span via a getter that checks if one of # Register attributes on Doc and Span via a getter that checks if one of
# the contained tokens is set to is_tech_org == True. # the contained tokens is set to is_tech_org == True.
Doc.set_extension('has_tech_org', getter=self.has_tech_org) Doc.set_extension("has_tech_org", getter=self.has_tech_org)
Span.set_extension('has_tech_org', getter=self.has_tech_org) Span.set_extension("has_tech_org", getter=self.has_tech_org)
def __call__(self, doc): def __call__(self, doc):
"""Apply the pipeline component on a Doc object and modify it if matches """Apply the pipeline component on a Doc object and modify it if matches
@ -82,7 +84,7 @@ class TechCompanyRecognizer(object):
spans.append(entity) spans.append(entity)
# Set custom attribute on each token of the entity # Set custom attribute on each token of the entity
for token in entity: for token in entity:
token._.set('is_tech_org', True) token._.set("is_tech_org", True)
# Overwrite doc.ents and add entity – be careful not to replace! # Overwrite doc.ents and add entity – be careful not to replace!
doc.ents = list(doc.ents) + [entity] doc.ents = list(doc.ents) + [entity]
for span in spans: for span in spans:
@ -97,10 +99,10 @@ class TechCompanyRecognizer(object):
is a tech org. Since the getter is only called when we access the is a tech org. Since the getter is only called when we access the
attribute, we can refer to the Token's 'is_tech_org' attribute here, attribute, we can refer to the Token's 'is_tech_org' attribute here,
which is already set in the processing step.""" which is already set in the processing step."""
return any([t._.get('is_tech_org') for t in tokens]) return any([t._.get("is_tech_org") for t in tokens])
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)
# Expected output: # Expected output:

View File

@ -1,4 +1,4 @@
'''Example of adding a pipeline component to prohibit sentence boundaries """Example of adding a pipeline component to prohibit sentence boundaries
before certain tokens. before certain tokens.
What we do is write to the token.is_sent_start attribute, which What we do is write to the token.is_sent_start attribute, which
@ -10,16 +10,18 @@ should also improve the parse quality.
The specific example here is drawn from https://github.com/explosion/spaCy/issues/2627 The specific example here is drawn from https://github.com/explosion/spaCy/issues/2627
Other versions of the model may not make the original mistake, so the specific Other versions of the model may not make the original mistake, so the specific
example might not be apt for future versions. example might not be apt for future versions.
''' """
import plac import plac
import spacy import spacy
def prevent_sentence_boundaries(doc): def prevent_sentence_boundaries(doc):
for token in doc: for token in doc:
if not can_be_sentence_start(token): if not can_be_sentence_start(token):
token.is_sent_start = False token.is_sent_start = False
return doc return doc
def can_be_sentence_start(token): def can_be_sentence_start(token):
if token.i == 0: if token.i == 0:
return True return True
@ -32,17 +34,18 @@ def can_be_sentence_start(token):
else: else:
return False return False
def main(): def main():
nlp = spacy.load('en_core_web_lg') nlp = spacy.load("en_core_web_lg")
raw_text = "Been here and I'm loving it." raw_text = "Been here and I'm loving it."
doc = nlp(raw_text) doc = nlp(raw_text)
sentences = [sent.string.strip() for sent in doc.sents] sentences = [sent.string.strip() for sent in doc.sents]
print(sentences) print(sentences)
nlp.add_pipe(prevent_sentence_boundaries, before='parser') nlp.add_pipe(prevent_sentence_boundaries, before="parser")
doc = nlp(raw_text) doc = nlp(raw_text)
sentences = [sent.string.strip() for sent in doc.sents] sentences = [sent.string.strip() for sent in doc.sents]
print(sentences) print(sentences)
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)

View File

@ -1,10 +1,11 @@
'''Demonstrate adding a rule-based component that forces some tokens to not """Demonstrate adding a rule-based component that forces some tokens to not
be entities, before the NER tagger is applied. This is used to hotfix the issue be entities, before the NER tagger is applied. This is used to hotfix the issue
in https://github.com/explosion/spaCy/issues/2870 , present as of spaCy v2.0.16. in https://github.com/explosion/spaCy/issues/2870 , present as of spaCy v2.0.16.
''' """
import spacy import spacy
from spacy.attrs import ENT_IOB from spacy.attrs import ENT_IOB
def fix_space_tags(doc): def fix_space_tags(doc):
ent_iobs = doc.to_array([ENT_IOB]) ent_iobs = doc.to_array([ENT_IOB])
for i, token in enumerate(doc): for i, token in enumerate(doc):
@ -14,14 +15,16 @@ def fix_space_tags(doc):
doc.from_array([ENT_IOB], ent_iobs.reshape((len(doc), 1))) doc.from_array([ENT_IOB], ent_iobs.reshape((len(doc), 1)))
return doc return doc
def main():
    """Print the entities found in a sample text before and after the
    ``fix_space_tags`` component is inserted ahead of the ``ner`` pipe."""
    nlp = spacy.load("en_core_web_sm")
    text = u"""This is some crazy test where I dont need an Apple Watch to make things bug"""
    before = nlp(text)
    print("Before", before.ents)
    nlp.add_pipe(fix_space_tags, name="fix-ner", before="ner")
    after = nlp(text)
    print("After", after.ents)
if __name__ == "__main__":
main() main()

View File

@ -9,6 +9,7 @@ built-in dataset loader.
Compatible with: spaCy v2.0.0+ Compatible with: spaCy v2.0.0+
""" """
from __future__ import print_function, unicode_literals from __future__ import print_function, unicode_literals
from toolz import partition_all from toolz import partition_all
from pathlib import Path from pathlib import Path
from joblib import Parallel, delayed from joblib import Parallel, delayed
@ -22,9 +23,9 @@ import spacy
model=("Model name (needs tagger)", "positional", None, str), model=("Model name (needs tagger)", "positional", None, str),
n_jobs=("Number of workers", "option", "n", int), n_jobs=("Number of workers", "option", "n", int),
batch_size=("Batch-size for each process", "option", "b", int), batch_size=("Batch-size for each process", "option", "b", int),
limit=("Limit of entries from the dataset", "option", "l", int)) limit=("Limit of entries from the dataset", "option", "l", int),
def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000, )
limit=10000): def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10000):
nlp = spacy.load(model) # load spaCy model nlp = spacy.load(model) # load spaCy model
print("Loaded model '%s'" % model) print("Loaded model '%s'" % model)
if not output_dir.exists(): if not output_dir.exists():
@ -37,42 +38,44 @@ def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000,
partitions = partition_all(batch_size, texts) partitions = partition_all(batch_size, texts)
executor = Parallel(n_jobs=n_jobs) executor = Parallel(n_jobs=n_jobs)
do = delayed(transform_texts) do = delayed(transform_texts)
tasks = (do(nlp, i, batch, output_dir) tasks = (do(nlp, i, batch, output_dir) for i, batch in enumerate(partitions))
for i, batch in enumerate(partitions))
executor(tasks) executor(tasks)
def transform_texts(nlp, batch_id, texts, output_dir):
    """Process one batch of texts and write it to ``<output_dir>/<batch_id>.txt``.

    Each document becomes one output line: the ``represent_word`` form of
    every non-space token, joined by single spaces. If the output file already
    exists the batch is skipped and ``None`` is returned.
    """
    print(nlp.pipe_names)
    file_name = "%d.txt" % batch_id
    out_path = Path(output_dir) / file_name
    if out_path.exists():
        # Same batch was dispatched before; do not redo or overwrite it.
        return None
    print("Processing batch", batch_id)
    with out_path.open("w", encoding="utf8") as out_file:
        for doc in nlp.pipe(texts):
            words = [represent_word(tok) for tok in doc if not tok.is_space]
            out_file.write(" ".join(words) + "\n")
    print("Saved {} texts to {}.txt".format(len(texts), batch_id))
def represent_word(word):
    """Return the ``text|TAG`` representation of a token.

    The text is lower-cased ("true-cased") only when it is title-cased, the
    token begins a sentence according to ``is_sent_begin``, and the token's
    ``prob`` is lower than that of the lower-cased vocabulary entry.
    """
    text = word.text
    # Keep the original short-circuit order: cheap string test first, then
    # the sentence-position test, then the vocabulary lookup.
    if text.istitle() and is_sent_begin(word):
        lowered = text.lower()
        if word.prob < word.doc.vocab[lowered].prob:
            text = lowered
    return "|".join((text, word.tag_))
def is_sent_begin(word):
    """Heuristically decide whether ``word`` starts a sentence.

    True for the first token (index 0), or for a token at index 2 or later
    whose previous token's text is one of ``.``, ``!``, ``?`` or ``...``.
    """
    if word.i == 0:
        return True
    if word.i < 2:
        return False
    return word.nbor(-1).text in (".", "!", "?", "...")
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)

View File

@ -1,6 +1,6 @@
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes """Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
.conllu format for development data, allowing the official scorer to be used. .conllu format for development data, allowing the official scorer to be used.
''' """
from __future__ import unicode_literals from __future__ import unicode_literals
import plac import plac
import tqdm import tqdm
@ -35,6 +35,7 @@ spacy.lang.ja.Japanese.Defaults.use_janome = False
random.seed(0) random.seed(0)
numpy.random.seed(0) numpy.random.seed(0)
def minibatch_by_words(items, size=5000): def minibatch_by_words(items, size=5000):
random.shuffle(items) random.shuffle(items)
if isinstance(size, int): if isinstance(size, int):
@ -59,21 +60,31 @@ def minibatch_by_words(items, size=5000):
else: else:
break break
################ ################
# Data reading # # Data reading #
################ ################
# Raw string: "\s" in a normal string literal is an invalid escape sequence
# (DeprecationWarning, and a SyntaxWarning on newer Pythons).
space_re = re.compile(r"\s+")


def split_text(text):
    """Split ``text`` into paragraphs on blank lines.

    Each paragraph is stripped and every run of whitespace inside it is
    collapsed to a single space. Returns a list with one string per paragraph.
    """
    return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False, def read_data(
max_doc_length=None, limit=None): nlp,
'''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True, conllu_file,
text_file,
raw_text=True,
oracle_segments=False,
max_doc_length=None,
limit=None,
):
"""Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
include Doc objects created using nlp.make_doc and then aligned against include Doc objects created using nlp.make_doc and then aligned against
the gold-standard sequences. If oracle_segments=True, include Doc objects the gold-standard sequences. If oracle_segments=True, include Doc objects
created from the gold-standard segments. At least one must be True.''' created from the gold-standard segments. At least one must be True."""
if not raw_text and not oracle_segments: if not raw_text and not oracle_segments:
raise ValueError("At least one of raw_text or oracle_segments must be True") raise ValueError("At least one of raw_text or oracle_segments must be True")
paragraphs = split_text(text_file.read()) paragraphs = split_text(text_file.read())
@ -87,22 +98,21 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
for cs in cd: for cs in cd:
sent = defaultdict(list) sent = defaultdict(list)
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs: for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
if '.' in id_: if "." in id_:
continue continue
if '-' in id_: if "-" in id_:
continue continue
id_ = int(id_) - 1 id_ = int(id_) - 1
head = int(head)-1 if head != '0' else id_ head = int(head) - 1 if head != "0" else id_
sent['words'].append(word) sent["words"].append(word)
sent['tags'].append(tag) sent["tags"].append(tag)
sent['heads'].append(head) sent["heads"].append(head)
sent['deps'].append('ROOT' if dep == 'root' else dep) sent["deps"].append("ROOT" if dep == "root" else dep)
sent['spaces'].append(space_after == '_') sent["spaces"].append(space_after == "_")
sent['entities'] = ['-'] * len(sent['words']) sent["entities"] = ["-"] * len(sent["words"])
sent['heads'], sent['deps'] = projectivize(sent['heads'], sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
sent['deps'])
if oracle_segments: if oracle_segments:
docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces'])) docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
golds.append(GoldParse(docs[-1], **sent)) golds.append(GoldParse(docs[-1], **sent))
sent_annots.append(sent) sent_annots.append(sent)
@ -128,18 +138,18 @@ def read_conllu(file_):
sent = [] sent = []
doc = [] doc = []
for line in file_: for line in file_:
if line.startswith('# newdoc'): if line.startswith("# newdoc"):
if doc: if doc:
docs.append(doc) docs.append(doc)
doc = [] doc = []
elif line.startswith('#'): elif line.startswith("#"):
continue continue
elif not line.strip(): elif not line.strip():
if sent: if sent:
doc.append(sent) doc.append(sent)
sent = [] sent = []
else: else:
sent.append(list(line.strip().split('\t'))) sent.append(list(line.strip().split("\t")))
if len(sent[-1]) != 10: if len(sent[-1]) != 10:
print(repr(line)) print(repr(line))
raise ValueError raise ValueError
@ -154,25 +164,29 @@ def _make_gold(nlp, text, sent_annots):
# Flatten the conll annotations, and adjust the head indices # Flatten the conll annotations, and adjust the head indices
flat = defaultdict(list) flat = defaultdict(list)
for sent in sent_annots: for sent in sent_annots:
flat['heads'].extend(len(flat['words'])+head for head in sent['heads']) flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
for field in ['words', 'tags', 'deps', 'entities', 'spaces']: for field in ["words", "tags", "deps", "entities", "spaces"]:
flat[field].extend(sent[field]) flat[field].extend(sent[field])
# Construct text if necessary # Construct text if necessary
assert len(flat['words']) == len(flat['spaces']) assert len(flat["words"]) == len(flat["spaces"])
if text is None: if text is None:
text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces'])) text = "".join(
word + " " * space for word, space in zip(flat["words"], flat["spaces"])
)
doc = nlp.make_doc(text) doc = nlp.make_doc(text)
flat.pop('spaces') flat.pop("spaces")
gold = GoldParse(doc, **flat) gold = GoldParse(doc, **flat)
return doc, gold return doc, gold
############################# #############################
# Data transforms for spaCy # # Data transforms for spaCy #
############################# #############################
def golds_to_gold_tuples(docs, golds): def golds_to_gold_tuples(docs, golds):
'''Get out the annoying 'tuples' format used by begin_training, given the """Get out the annoying 'tuples' format used by begin_training, given the
GoldParse objects.''' GoldParse objects."""
tuples = [] tuples = []
for doc, gold in zip(docs, golds): for doc, gold in zip(docs, golds):
text = doc.text text = doc.text
@ -186,15 +200,16 @@ def golds_to_gold_tuples(docs, golds):
# Evaluation # # Evaluation #
############## ##############
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
with text_loc.open('r', encoding='utf8') as text_file: with text_loc.open("r", encoding="utf8") as text_file:
texts = split_text(text_file.read()) texts = split_text(text_file.read())
docs = list(nlp.pipe(texts)) docs = list(nlp.pipe(texts))
with sys_loc.open('w', encoding='utf8') as out_file: with sys_loc.open("w", encoding="utf8") as out_file:
write_conllu(docs, out_file) write_conllu(docs, out_file)
with gold_loc.open('r', encoding='utf8') as gold_file: with gold_loc.open("r", encoding="utf8") as gold_file:
gold_ud = conll17_ud_eval.load_conllu(gold_file) gold_ud = conll17_ud_eval.load_conllu(gold_file)
with sys_loc.open('r', encoding='utf8') as sys_file: with sys_loc.open("r", encoding="utf8") as sys_file:
sys_ud = conll17_ud_eval.load_conllu(sys_file) sys_ud = conll17_ud_eval.load_conllu(sys_file)
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud) scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
return scores return scores
@ -202,7 +217,7 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
def write_conllu(docs, file_): def write_conllu(docs, file_):
merger = Matcher(docs[0].vocab) merger = Matcher(docs[0].vocab)
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}]) merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
matches = merger(doc) matches = merger(doc)
spans = [doc[start : end + 1] for _, start, end in matches] spans = [doc[start : end + 1] for _, start, end in matches]
@ -214,58 +229,73 @@ def write_conllu(docs, file_):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
file_.write("# text = {text}\n".format(text=sent.text)) file_.write("# text = {text}\n".format(text=sent.text))
for k, token in enumerate(sent): for k, token in enumerate(sent):
file_.write(token._.get_conllu_lines(k) + '\n') file_.write(token._.get_conllu_lines(k) + "\n")
file_.write('\n') file_.write("\n")
def print_progress(itn, losses, ud_scores):
    """Print one tab-separated progress row for iteration ``itn``.

    The row holds the iteration number, the ``parser`` loss (0.0 if absent)
    and the LAS, UAS, XPOS, Sentences and Words F1 scores from ``ud_scores``
    scaled to percentages. A header row is printed first when ``itn`` is 0.
    """

    def pct(metric):
        return ud_scores[metric].f1 * 100

    fields = dict(
        dep_loss=losses.get("parser", 0.0),
        tag_loss=losses.get("tagger", 0.0),
        words=pct("Words"),
        sents=pct("Sentences"),
        tags=pct("XPOS"),
        uas=pct("UAS"),
        las=pct("LAS"),
    )
    if itn == 0:
        print("\t".join(["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"]))
    # Column order mirrors the header; note the tagger loss is not printed.
    shown = ("dep_loss", "las", "uas", "tags", "sents", "words")
    columns = ["{:d}"] + ["{%s:.1f}" % key for key in shown]
    print("\t".join(columns).format(itn, **fields))
# def get_sent_conllu(sent, sent_id): # def get_sent_conllu(sent, sent_id):
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)] # lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
def get_token_conllu(token, i):
    """Return the CoNLL-U row(s) for ``token`` as a single string.

    ``i`` is the 0-based position of the token within its sentence; the ID
    column is written 1-based (``i + 1``). The head column is 0 when the token
    is its own head, otherwise the head's 1-based position in the sentence.

    When ``token._.begins_fused`` is set, a multiword-token range row
    (``start-end`` ID, remaining columns ``_``) is emitted before the token's
    own row, separated by a newline.
    """
    lines = []
    if token._.begins_fused:
        # Count the tokens in the fused group, stopping at the end of the doc
        # so that nbor() is never asked for a token that does not exist.
        n = 1
        doc_length = len(token.doc)
        while token.i + n < doc_length and token.nbor(n)._.inside_fused:
            n += 1
        # The group covers n tokens whose 1-based ids are i+1 .. i+n.
        id_ = "%d-%d" % (i + 1, i + n)
        # Join the fields into ONE tab-separated row; keeping them as separate
        # list items would put every field on its own output line.
        # NOTE(review): the FORM column uses the first sub-token's text, as
        # before -- confirm whether the surface form should be written here.
        lines.append("\t".join([id_, token.text] + ["_"] * 8))
    if token.head.i == token.i:
        head = 0
    else:
        head = i + (token.head.i - token.i) + 1
    fields = [
        str(i + 1),
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        "_",
        str(head),
        token.dep_.lower(),
        "_",
        "_",
    ]
    lines.append("\t".join(fields))
    return "\n".join(lines)
Token.set_extension('get_conllu_lines', method=get_token_conllu)
Token.set_extension('begins_fused', default=False) Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension('inside_fused', default=False) Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
################## ##################
@ -274,31 +304,32 @@ Token.set_extension('inside_fused', default=False)
def load_nlp(corpus, config):
    """Create a blank pipeline for the language prefix of ``corpus``.

    The language code is the part of ``corpus`` before the first underscore.
    When ``config.vectors`` is truthy, the vocab is loaded from its ``vocab``
    sub-path.
    """
    lang, _, _ = corpus.partition("_")
    nlp = spacy.blank(lang)
    if not config.vectors:
        return nlp
    nlp.vocab.from_disk(config.vectors / "vocab")
    return nlp
def initialize_pipeline(nlp, docs, golds, config):
    """Add a parser and a tagger to ``nlp`` and start training.

    Optional multitask objectives are attached to the parser according to
    ``config``, every non-None gold tag is registered with the tagger, and
    gold dependency labels unknown to the parser are truncated at ``||``.
    Returns whatever ``nlp.begin_training`` returns.
    """
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser)
    if config.multitask_tag:
        nlp.parser.add_multitask_objective("tag")
    if config.multitask_sent:
        nlp.parser.add_multitask_objective("sent_start")
    nlp.parser.moves.add_action(2, "subtok")
    tagger = nlp.create_pipe("tagger")
    nlp.add_pipe(tagger)
    for gold in golds:
        for tag in (t for t in gold.tags if t is not None):
            nlp.tagger.add_label(tag)
    # Replace labels that didn't make the frequency cutoff
    known_labels = {
        action.split("-")[1] for action in nlp.parser.labels if "-" in action
    }
    for gold in golds:
        for idx, label in enumerate(gold.labels):
            if label is None or label in known_labels:
                continue
            gold.labels[idx] = label.split("||")[0]

    def get_gold_tuples():
        return golds_to_gold_tuples(docs, golds)

    return nlp.begin_training(get_gold_tuples)
@ -306,6 +337,7 @@ def initialize_pipeline(nlp, docs, golds, config):
# Command line helpers # # Command line helpers #
######################## ########################
@attr.s @attr.s
class Config(object): class Config(object):
vectors = attr.ib(default=None) vectors = attr.ib(default=None)
@ -318,7 +350,7 @@ class Config(object):
@classmethod @classmethod
def load(cls, loc): def load(cls, loc):
with Path(loc).open('r', encoding='utf8') as file_: with Path(loc).open("r", encoding="utf8") as file_:
cfg = json.load(file_) cfg = json.load(file_)
return cls(**cfg) return cls(**cfg)
@ -331,32 +363,36 @@ class Dataset(object):
self.text = None self.text = None
for file_path in self.path.iterdir(): for file_path in self.path.iterdir():
name = file_path.parts[-1] name = file_path.parts[-1]
if section in name and name.endswith('conllu'): if section in name and name.endswith("conllu"):
self.conllu = file_path self.conllu = file_path
elif section in name and name.endswith('txt'): elif section in name and name.endswith("txt"):
self.text = file_path self.text = file_path
if self.conllu is None: if self.conllu is None:
msg = "Could not find .txt file in {path} for {section}" msg = "Could not find .txt file in {path} for {section}"
raise IOError(msg.format(section=section, path=path)) raise IOError(msg.format(section=section, path=path))
if self.text is None: if self.text is None:
msg = "Could not find .txt file in {path} for {section}" msg = "Could not find .txt file in {path} for {section}"
self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0] self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0]
class TreebankPaths(object):
    """Holds the ``train`` and ``dev`` datasets of one treebank directory."""

    def __init__(self, ud_path, treebank, **cfg):
        # Extra keyword arguments are accepted but not used.
        treebank_dir = ud_path / treebank
        self.train = Dataset(treebank_dir, "train")
        self.dev = Dataset(treebank_dir, "dev")
        # The language is taken from the training section.
        self.lang = self.train.lang
@plac.annotations( @plac.annotations(
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc", corpus=(
"positional", None, str), "UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
"positional",
None,
str,
),
parses_dir=("Directory to write the development parses", "positional", None, Path), parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "positional", None, Config.load), config=("Path to json formatted config file", "positional", None, Config.load),
limit=("Size limit", "option", "n", int) limit=("Size limit", "option", "n", int),
) )
def main(ud_dir, parses_dir, config, corpus, limit=0): def main(ud_dir, parses_dir, config, corpus, limit=0):
paths = TreebankPaths(ud_dir, corpus) paths = TreebankPaths(ud_dir, corpus)
@ -365,8 +401,13 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
print("Train and evaluate", corpus, "using lang", paths.lang) print("Train and evaluate", corpus, "using lang", paths.lang)
nlp = load_nlp(paths.lang, config) nlp = load_nlp(paths.lang, config)
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(), docs, golds = read_data(
max_doc_length=config.max_doc_length, limit=limit) nlp,
paths.train.conllu.open(),
paths.train.text.open(),
max_doc_length=config.max_doc_length,
limit=limit,
)
optimizer = initialize_pipeline(nlp, docs, golds, config) optimizer = initialize_pipeline(nlp, docs, golds, config)
@ -379,14 +420,19 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
for batch in batches: for batch in batches:
batch_docs, batch_gold = zip(*batch) batch_docs, batch_gold = zip(*batch)
pbar.update(sum(len(doc) for doc in batch_docs)) pbar.update(sum(len(doc) for doc in batch_docs))
nlp.update(batch_docs, batch_gold, sgd=optimizer, nlp.update(
drop=config.dropout, losses=losses) batch_docs,
batch_gold,
sgd=optimizer,
drop=config.dropout,
losses=losses,
)
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i) out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
with nlp.use_params(optimizer.averages): with nlp.use_params(optimizer.averages):
scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path) scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
print_progress(i, losses, scores) print_progress(i, losses, scores)
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)

View File

@ -1,4 +1,4 @@
'''This example shows how to add a multi-task objective that is trained """This example shows how to add a multi-task objective that is trained
alongside the entity recognizer. This is an alternative to adding features alongside the entity recognizer. This is an alternative to adding features
to the model. to the model.
@ -19,7 +19,7 @@ The specific example here is not necessarily a good idea --- but it shows
how an arbitrary objective function for some word can be used. how an arbitrary objective function for some word can be used.
Developed and tested for spaCy 2.0.6 Developed and tested for spaCy 2.0.6
''' """
import random import random
import plac import plac
import spacy import spacy
@ -30,30 +30,29 @@ random.seed(0)
PWD = os.path.dirname(__file__) PWD = os.path.dirname(__file__)
TRAIN_DATA = list(read_json_file(os.path.join(PWD, 'training-data.json'))) TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json")))
def get_position_label(i, words, tags, heads, labels, ents):
    """Return a label describing where word ``i`` sits in the document.

    Documents shorter than 20 words always give ``short-doc``. Otherwise the
    label depends only on ``i`` and the document length; ``tags``, ``heads``,
    ``labels`` and ``ents`` are accepted but not used.
    """
    n_words = len(words)
    if n_words < 20:
        return "short-doc"
    if i == 0:
        return "first-word"
    if i < 10:
        return "early-word"
    if i < 20:
        return "mid-word"
    return "last-word" if i == n_words - 1 else "late-word"
def main(n_iter=10): def main(n_iter=10):
nlp = spacy.blank('en') nlp = spacy.blank("en")
ner = nlp.create_pipe('ner') ner = nlp.create_pipe("ner")
ner.add_multitask_objective(get_position_label) ner.add_multitask_objective(get_position_label)
nlp.add_pipe(ner) nlp.add_pipe(ner)
@ -71,15 +70,16 @@ def main(n_iter=10):
[gold], # batch of annotations [gold], # batch of annotations
drop=0.2, # dropout - make it harder to memorise data drop=0.2, # dropout - make it harder to memorise data
sgd=optimizer, # callable to update weights sgd=optimizer, # callable to update weights
losses=losses) losses=losses,
print(losses.get('nn_labeller', 0.0), losses['ner']) )
print(losses.get("nn_labeller", 0.0), losses["ner"])
# test the trained model # test the trained model
for text, _ in TRAIN_DATA: for text, _ in TRAIN_DATA:
doc = nlp(text) doc = nlp(text)
print('Entities', [(ent.text, ent.label_) for ent in doc.ents]) print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc]) print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)

View File

@ -1,4 +1,4 @@
'''This script is experimental. """This script is experimental.
Try pre-training the CNN component of the text categorizer using a cheap Try pre-training the CNN component of the text categorizer using a cheap
language modelling-like objective. Specifically, we load pre-trained vectors language modelling-like objective. Specifically, we load pre-trained vectors
@ -12,7 +12,7 @@ To evaluate the technique, we're pre-training with the 50k texts from the IMDB
corpus, and then training with only 100 labels. Note that it's a bit dirty to corpus, and then training with only 100 labels. Note that it's a bit dirty to
pre-train with the development data, but also not *so* terrible: we're not using pre-train with the development data, but also not *so* terrible: we're not using
the development labels, after all --- only the unlabelled text. the development labels, after all --- only the unlabelled text.
''' """
import plac import plac
import random import random
import spacy import spacy
@ -46,8 +46,8 @@ def load_textcat_data(limit=0):
train_data = train_data[-limit:] train_data = train_data[-limit:]
texts, labels = zip(*train_data) texts, labels = zip(*train_data)
eval_texts, eval_labels = zip(*eval_data) eval_texts, eval_labels = zip(*eval_data)
cats = [{'POSITIVE': bool(y), 'NEGATIVE': not bool(y)} for y in labels] cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
eval_cats = [{'POSITIVE': bool(y), 'NEGATIVE': not bool(y)} for y in eval_labels] eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
return (texts, cats), (eval_texts, eval_cats) return (texts, cats), (eval_texts, eval_cats)
@ -57,6 +57,7 @@ def prefer_gpu():
return False return False
else: else:
import cupy.random import cupy.random
cupy.random.seed(0) cupy.random.seed(0)
return True return True
@ -68,7 +69,7 @@ def build_textcat_model(tok2vec, nr_class, width):
from thinc.misc import Residual, LayerNorm from thinc.misc import Residual, LayerNorm
from spacy._ml import logistic, zero_init from spacy._ml import logistic, zero_init
with Model.define_operators({'>>': chain}): with Model.define_operators({">>": chain}):
model = ( model = (
tok2vec tok2vec
>> flatten_add_lengths >> flatten_add_lengths
@ -78,27 +79,35 @@ def build_textcat_model(tok2vec, nr_class, width):
model.tok2vec = tok2vec model.tok2vec = tok2vec
return model return model
def block_gradients(model):
    """Wrap ``model`` in a layer whose forward pass returns the model's output
    paired with ``None`` in place of the backward callback."""
    from thinc.api import wrap

    def forward(X, drop=0.0):
        # Run the wrapped model but discard its backward callback.
        Y, _unused_backprop = model.begin_update(X, drop=drop)
        return (Y, None)

    wrapped = wrap(forward, model)
    return wrapped
def create_pipeline(width, embed_size, vectors_model):
    """Load ``vectors_model`` and append a text categorizer to it.

    The categorizer has the labels ``POSITIVE`` and ``NEGATIVE`` and uses a
    model built by ``build_textcat_model`` on a fresh ``Tok2Vec`` layer of
    the given ``width`` and ``embed_size``. Returns the pipeline.
    """
    print("Load vectors")
    nlp = spacy.load(vectors_model)
    print("Start training")
    tok2vec = Tok2Vec(width=width, embed_size=embed_size)
    classifier = build_textcat_model(tok2vec, 2, width)
    textcat = TextCategorizer(
        nlp.vocab, labels=["POSITIVE", "NEGATIVE"], model=classifier
    )
    nlp.add_pipe(textcat)
    return nlp
def train_tensorizer(nlp, texts, dropout, n_iter): def train_tensorizer(nlp, texts, dropout, n_iter):
tensorizer = nlp.create_pipe('tensorizer') tensorizer = nlp.create_pipe("tensorizer")
nlp.add_pipe(tensorizer) nlp.add_pipe(tensorizer)
optimizer = nlp.begin_training() optimizer = nlp.begin_training()
for i in range(n_iter): for i in range(n_iter):
@ -109,36 +118,43 @@ def train_tensorizer(nlp, texts, dropout, n_iter):
print(losses) print(losses)
return optimizer return optimizer
def train_textcat(nlp, n_texts, n_iter=10):
    """Train the ``textcat`` pipe of ``nlp`` and print a score table.

    Data comes from ``load_textcat_data(limit=n_texts)``. All other pipes are
    disabled during training. After each of the ``n_iter`` iterations the
    textcat loss and the precision, recall and F-score from
    ``evaluate_textcat`` on the dev split are printed as one row.
    """
    textcat = nlp.get_pipe("textcat")
    # Snapshot the tok2vec weights so they can be restored after
    # begin_training() -- presumably that call re-initialises them; confirm.
    pretrained_weights = textcat.model.tok2vec.to_bytes()
    (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
    summary = "Using {} examples ({} training, {} evaluation)"
    print(summary.format(n_texts, len(train_texts), len(dev_texts)))
    annotations_per_text = [{"cats": cats} for cats in train_cats]
    train_data = list(zip(train_texts, annotations_per_text))

    # get names of other pipes to disable them during training
    frozen_pipes = [name for name in nlp.pipe_names if name != "textcat"]
    with nlp.disable_pipes(*frozen_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        textcat.model.tok2vec.from_bytes(pretrained_weights)
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        row = "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}"
        for _ in range(n_iter):
            losses = {"textcat": 0.0}
            # batch up the examples using spaCy's minibatch
            for batch in minibatch(tqdm.tqdm(train_data), size=2):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
            # print a simple table
            print(
                row.format(
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )
def evaluate_textcat(tokenizer, textcat, texts, cats): def evaluate_textcat(tokenizer, textcat, texts, cats):
@ -153,9 +169,9 @@ def evaluate_textcat(tokenizer, textcat, texts, cats):
if label not in gold: if label not in gold:
continue continue
if score >= 0.5 and gold[label] >= 0.5: if score >= 0.5 and gold[label] >= 0.5:
tp += 1. tp += 1.0
elif score >= 0.5 and gold[label] < 0.5: elif score >= 0.5 and gold[label] < 0.5:
fp += 1. fp += 1.0
elif score < 0.5 and gold[label] < 0.5: elif score < 0.5 and gold[label] < 0.5:
tn += 1 tn += 1
elif score < 0.5 and gold[label] >= 0.5: elif score < 0.5 and gold[label] >= 0.5:
@ -163,8 +179,7 @@ def evaluate_textcat(tokenizer, textcat, texts, cats):
precision = tp / (tp + fp) precision = tp / (tp + fp)
recall = tp / (tp + fn) recall = tp / (tp + fn)
f_score = 2 * (precision * recall) / (precision + recall) f_score = 2 * (precision * recall) / (precision + recall)
return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score} return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
@plac.annotations( @plac.annotations(
@ -173,10 +188,16 @@ def evaluate_textcat(tokenizer, textcat, texts, cats):
pretrain_iters=("Number of iterations to pretrain", "option", "pn", int), pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
train_iters=("Number of iterations to pretrain", "option", "tn", int), train_iters=("Number of iterations to pretrain", "option", "tn", int),
train_examples=("Number of labelled examples", "option", "eg", int), train_examples=("Number of labelled examples", "option", "eg", int),
vectors_model=("Name or path to vectors model to learn from") vectors_model=("Name or path to vectors model to learn from"),
) )
def main(width, embed_size, vectors_model, def main(
pretrain_iters=30, train_iters=30, train_examples=1000): width,
embed_size,
vectors_model,
pretrain_iters=30,
train_iters=30,
train_examples=1000,
):
random.seed(0) random.seed(0)
numpy.random.seed(0) numpy.random.seed(0)
use_gpu = prefer_gpu() use_gpu = prefer_gpu()
@ -190,5 +211,6 @@ def main(width, embed_size, vectors_model,
print("Train textcat") print("Train textcat")
train_textcat(nlp, train_examples, n_iter=train_iters) train_textcat(nlp, train_examples, n_iter=train_iters)
if __name__ == '__main__':
if __name__ == "__main__":
plac.call(main) plac.call(main)

View File

@ -29,73 +29,113 @@ from spacy.util import minibatch, compounding
# training data: texts, heads and dependency labels # training data: texts, heads and dependency labels
# for no relation, we simply chose an arbitrary dependency label, e.g. '-' # for no relation, we simply chose an arbitrary dependency label, e.g. '-'
TRAIN_DATA = [ TRAIN_DATA = [
("find a cafe with great wifi", { (
'heads': [0, 2, 0, 5, 5, 2], # index of token head "find a cafe with great wifi",
'deps': ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE'] {
}), "heads": [0, 2, 0, 5, 5, 2], # index of token head
("find a hotel near the beach", { "deps": ["ROOT", "-", "PLACE", "-", "QUALITY", "ATTRIBUTE"],
'heads': [0, 2, 0, 5, 5, 2], },
'deps': ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE'] ),
}), (
("find me the closest gym that's open late", { "find a hotel near the beach",
'heads': [0, 0, 4, 4, 0, 6, 4, 6, 6], {
'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME'] "heads": [0, 2, 0, 5, 5, 2],
}), "deps": ["ROOT", "-", "PLACE", "QUALITY", "-", "ATTRIBUTE"],
("show me the cheapest store that sells flowers", { },
'heads': [0, 0, 4, 4, 0, 4, 4, 4], # attach "flowers" to store! ),
'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT'] (
}), "find me the closest gym that's open late",
("find a nice restaurant in london", { {
'heads': [0, 3, 3, 0, 3, 3], "heads": [0, 0, 4, 4, 0, 6, 4, 6, 6],
'deps': ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION'] "deps": [
}), "ROOT",
("show me the coolest hostel in berlin", { "-",
'heads': [0, 0, 4, 4, 0, 4, 4], "-",
'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION'] "QUALITY",
}), "PLACE",
("find a good italian restaurant near work", { "-",
'heads': [0, 4, 4, 4, 0, 4, 5], "-",
'deps': ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION'] "ATTRIBUTE",
}) "TIME",
],
},
),
(
"show me the cheapest store that sells flowers",
{
"heads": [0, 0, 4, 4, 0, 4, 4, 4], # attach "flowers" to store!
"deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "-", "PRODUCT"],
},
),
(
"find a nice restaurant in london",
{
"heads": [0, 3, 3, 0, 3, 3],
"deps": ["ROOT", "-", "QUALITY", "PLACE", "-", "LOCATION"],
},
),
(
"show me the coolest hostel in berlin",
{
"heads": [0, 0, 4, 4, 0, 4, 4],
"deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "LOCATION"],
},
),
(
"find a good italian restaurant near work",
{
"heads": [0, 4, 4, 4, 0, 4, 5],
"deps": [
"ROOT",
"-",
"QUALITY",
"ATTRIBUTE",
"PLACE",
"ATTRIBUTE",
"LOCATION",
],
},
),
] ]
@plac.annotations( @plac.annotations(
model=("Model name. Defaults to blank 'en' model.", "option", "m", str), model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
output_dir=("Optional output directory", "option", "o", Path), output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int)) n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=15): def main(model=None, output_dir=None, n_iter=15):
"""Load the model, set up the pipeline and train the parser.""" """Load the model, set up the pipeline and train the parser."""
if model is not None: if model is not None:
nlp = spacy.load(model) # load existing spaCy model nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model) print("Loaded model '%s'" % model)
else: else:
nlp = spacy.blank('en') # create blank Language class nlp = spacy.blank("en") # create blank Language class
print("Created blank 'en' model") print("Created blank 'en' model")
# We'll use the built-in dependency parser class, but we want to create a # We'll use the built-in dependency parser class, but we want to create a
# fresh instance just in case. # fresh instance just in case.
if 'parser' in nlp.pipe_names: if "parser" in nlp.pipe_names:
nlp.remove_pipe('parser') nlp.remove_pipe("parser")
parser = nlp.create_pipe('parser') parser = nlp.create_pipe("parser")
nlp.add_pipe(parser, first=True) nlp.add_pipe(parser, first=True)
for text, annotations in TRAIN_DATA: for text, annotations in TRAIN_DATA:
for dep in annotations.get('deps', []): for dep in annotations.get("deps", []):
parser.add_label(dep) parser.add_label(dep)
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser'] other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
with nlp.disable_pipes(*other_pipes): # only train parser with nlp.disable_pipes(*other_pipes): # only train parser
optimizer = nlp.begin_training() optimizer = nlp.begin_training()
for itn in range(n_iter): for itn in range(n_iter):
random.shuffle(TRAIN_DATA) random.shuffle(TRAIN_DATA)
losses = {} losses = {}
# batch up the examples using spaCy's minibatch # batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)) batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches: for batch in batches:
texts, annotations = zip(*batch) texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, losses=losses) nlp.update(texts, annotations, sgd=optimizer, losses=losses)
print('Losses', losses) print("Losses", losses)
# test the trained model # test the trained model
test_model(nlp) test_model(nlp)
@ -115,16 +155,18 @@ def main(model=None, output_dir=None, n_iter=15):
def test_model(nlp): def test_model(nlp):
texts = ["find a hotel with good wifi", texts = [
"find a hotel with good wifi",
"find me the cheapest gym near work", "find me the cheapest gym near work",
"show me the best hotel in berlin"] "show me the best hotel in berlin",
]
docs = nlp.pipe(texts) docs = nlp.pipe(texts)
for doc in docs: for doc in docs:
print(doc.text) print(doc.text)
print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-']) print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"])
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)
# Expected output: # Expected output:

View File

@ -20,51 +20,48 @@ from spacy.util import minibatch, compounding
# training data # training data
TRAIN_DATA = [ TRAIN_DATA = [
('Who is Shaka Khan?', { ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
'entities': [(7, 17, 'PERSON')] ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
}),
('I like London and Berlin.', {
'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
})
] ]
@plac.annotations( @plac.annotations(
model=("Model name. Defaults to blank 'en' model.", "option", "m", str), model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
output_dir=("Optional output directory", "option", "o", Path), output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int)) n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=100): def main(model=None, output_dir=None, n_iter=100):
"""Load the model, set up the pipeline and train the entity recognizer.""" """Load the model, set up the pipeline and train the entity recognizer."""
if model is not None: if model is not None:
nlp = spacy.load(model) # load existing spaCy model nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model) print("Loaded model '%s'" % model)
else: else:
nlp = spacy.blank('en') # create blank Language class nlp = spacy.blank("en") # create blank Language class
print("Created blank 'en' model") print("Created blank 'en' model")
# create the built-in pipeline components and add them to the pipeline # create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy # nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names: if "ner" not in nlp.pipe_names:
ner = nlp.create_pipe('ner') ner = nlp.create_pipe("ner")
nlp.add_pipe(ner, last=True) nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels # otherwise, get it so we can add labels
else: else:
ner = nlp.get_pipe('ner') ner = nlp.get_pipe("ner")
# add labels # add labels
for _, annotations in TRAIN_DATA: for _, annotations in TRAIN_DATA:
for ent in annotations.get('entities'): for ent in annotations.get("entities"):
ner.add_label(ent[2]) ner.add_label(ent[2])
# get names of other pipes to disable them during training # get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes): # only train NER with nlp.disable_pipes(*other_pipes): # only train NER
optimizer = nlp.begin_training() optimizer = nlp.begin_training()
for itn in range(n_iter): for itn in range(n_iter):
random.shuffle(TRAIN_DATA) random.shuffle(TRAIN_DATA)
losses = {} losses = {}
# batch up the examples using spaCy's minibatch # batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)) batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches: for batch in batches:
texts, annotations = zip(*batch) texts, annotations = zip(*batch)
nlp.update( nlp.update(
@ -72,14 +69,15 @@ def main(model=None, output_dir=None, n_iter=100):
annotations, # batch of annotations annotations, # batch of annotations
drop=0.5, # dropout - make it harder to memorise data drop=0.5, # dropout - make it harder to memorise data
sgd=optimizer, # callable to update weights sgd=optimizer, # callable to update weights
losses=losses) losses=losses,
print('Losses', losses) )
print("Losses", losses)
# test the trained model # test the trained model
for text, _ in TRAIN_DATA: for text, _ in TRAIN_DATA:
doc = nlp(text) doc = nlp(text)
print('Entities', [(ent.text, ent.label_) for ent in doc.ents]) print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc]) print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
# save model to output directory # save model to output directory
if output_dir is not None: if output_dir is not None:
@ -94,11 +92,11 @@ def main(model=None, output_dir=None, n_iter=100):
nlp2 = spacy.load(output_dir) nlp2 = spacy.load(output_dir)
for text, _ in TRAIN_DATA: for text, _ in TRAIN_DATA:
doc = nlp2(text) doc = nlp2(text)
print('Entities', [(ent.text, ent.label_) for ent in doc.ents]) print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc]) print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)
# Expected output: # Expected output:

View File

@ -35,7 +35,7 @@ from spacy.util import minibatch, compounding
# new entity label # new entity label
LABEL = 'ANIMAL' LABEL = "ANIMAL"
# training data # training data
# Note: If you're using an existing model, make sure to mix in examples of # Note: If you're using an existing model, make sure to mix in examples of
@ -43,29 +43,21 @@ LABEL = 'ANIMAL'
# model might learn the new type, but "forget" what it previously knew. # model might learn the new type, but "forget" what it previously knew.
# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting # https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
TRAIN_DATA = [ TRAIN_DATA = [
("Horses are too tall and they pretend to care about your feelings", { (
'entities': [(0, 6, 'ANIMAL')] "Horses are too tall and they pretend to care about your feelings",
}), {"entities": [(0, 6, "ANIMAL")]},
),
("Do they bite?", { ("Do they bite?", {"entities": []}),
'entities': [] (
}), "horses are too tall and they pretend to care about your feelings",
{"entities": [(0, 6, "ANIMAL")]},
("horses are too tall and they pretend to care about your feelings", { ),
'entities': [(0, 6, 'ANIMAL')] ("horses pretend to care about your feelings", {"entities": [(0, 6, "ANIMAL")]}),
}), (
"they pretend to care about your feelings, those horses",
("horses pretend to care about your feelings", { {"entities": [(48, 54, "ANIMAL")]},
'entities': [(0, 6, 'ANIMAL')] ),
}), ("horses?", {"entities": [(0, 6, "ANIMAL")]}),
("they pretend to care about your feelings, those horses", {
'entities': [(48, 54, 'ANIMAL')]
}),
("horses?", {
'entities': [(0, 6, 'ANIMAL')]
})
] ]
@ -73,23 +65,24 @@ TRAIN_DATA = [
model=("Model name. Defaults to blank 'en' model.", "option", "m", str), model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
new_model_name=("New model name for model meta.", "option", "nm", str), new_model_name=("New model name for model meta.", "option", "nm", str),
output_dir=("Optional output directory", "option", "o", Path), output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int)) n_iter=("Number of training iterations", "option", "n", int),
def main(model=None, new_model_name='animal', output_dir=None, n_iter=10): )
def main(model=None, new_model_name="animal", output_dir=None, n_iter=10):
"""Set up the pipeline and entity recognizer, and train the new entity.""" """Set up the pipeline and entity recognizer, and train the new entity."""
if model is not None: if model is not None:
nlp = spacy.load(model) # load existing spaCy model nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model) print("Loaded model '%s'" % model)
else: else:
nlp = spacy.blank('en') # create blank Language class nlp = spacy.blank("en") # create blank Language class
print("Created blank 'en' model") print("Created blank 'en' model")
# Add entity recognizer to model if it's not in the pipeline # Add entity recognizer to model if it's not in the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy # nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names: if "ner" not in nlp.pipe_names:
ner = nlp.create_pipe('ner') ner = nlp.create_pipe("ner")
nlp.add_pipe(ner) nlp.add_pipe(ner)
# otherwise, get it, so we can add labels to it # otherwise, get it, so we can add labels to it
else: else:
ner = nlp.get_pipe('ner') ner = nlp.get_pipe("ner")
ner.add_label(LABEL) # add new entity label to entity recognizer ner.add_label(LABEL) # add new entity label to entity recognizer
if model is None: if model is None:
@ -100,21 +93,20 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
optimizer = nlp.entity.create_optimizer() optimizer = nlp.entity.create_optimizer()
# get names of other pipes to disable them during training # get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes): # only train NER with nlp.disable_pipes(*other_pipes): # only train NER
for itn in range(n_iter): for itn in range(n_iter):
random.shuffle(TRAIN_DATA) random.shuffle(TRAIN_DATA)
losses = {} losses = {}
# batch up the examples using spaCy's minibatch # batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)) batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches: for batch in batches:
texts, annotations = zip(*batch) texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.35, nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
losses=losses) print("Losses", losses)
print('Losses', losses)
# test the trained model # test the trained model
test_text = 'Do you like horses?' test_text = "Do you like horses?"
doc = nlp(test_text) doc = nlp(test_text)
print("Entities in '%s'" % test_text) print("Entities in '%s'" % test_text)
for ent in doc.ents: for ent in doc.ents:
@ -125,7 +117,7 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
output_dir = Path(output_dir) output_dir = Path(output_dir)
if not output_dir.exists(): if not output_dir.exists():
output_dir.mkdir() output_dir.mkdir()
nlp.meta['name'] = new_model_name # rename model nlp.meta["name"] = new_model_name # rename model
nlp.to_disk(output_dir) nlp.to_disk(output_dir)
print("Saved model to", output_dir) print("Saved model to", output_dir)
@ -137,5 +129,5 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
print(ent.label_, ent.text) print(ent.label_, ent.text)
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)

View File

@ -18,62 +18,69 @@ from spacy.util import minibatch, compounding
# training data # training data
TRAIN_DATA = [ TRAIN_DATA = [
("They trade mortgage-backed securities.", { (
'heads': [1, 1, 4, 4, 5, 1, 1], "They trade mortgage-backed securities.",
'deps': ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct'] {
}), "heads": [1, 1, 4, 4, 5, 1, 1],
("I like London and Berlin.", { "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
'heads': [1, 1, 1, 2, 2, 1], },
'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct'] ),
}) (
"I like London and Berlin.",
{
"heads": [1, 1, 1, 2, 2, 1],
"deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
},
),
] ]
@plac.annotations( @plac.annotations(
model=("Model name. Defaults to blank 'en' model.", "option", "m", str), model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
output_dir=("Optional output directory", "option", "o", Path), output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int)) n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=10): def main(model=None, output_dir=None, n_iter=10):
"""Load the model, set up the pipeline and train the parser.""" """Load the model, set up the pipeline and train the parser."""
if model is not None: if model is not None:
nlp = spacy.load(model) # load existing spaCy model nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model) print("Loaded model '%s'" % model)
else: else:
nlp = spacy.blank('en') # create blank Language class nlp = spacy.blank("en") # create blank Language class
print("Created blank 'en' model") print("Created blank 'en' model")
# add the parser to the pipeline if it doesn't exist # add the parser to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy # nlp.create_pipe works for built-ins that are registered with spaCy
if 'parser' not in nlp.pipe_names: if "parser" not in nlp.pipe_names:
parser = nlp.create_pipe('parser') parser = nlp.create_pipe("parser")
nlp.add_pipe(parser, first=True) nlp.add_pipe(parser, first=True)
# otherwise, get it, so we can add labels to it # otherwise, get it, so we can add labels to it
else: else:
parser = nlp.get_pipe('parser') parser = nlp.get_pipe("parser")
# add labels to the parser # add labels to the parser
for _, annotations in TRAIN_DATA: for _, annotations in TRAIN_DATA:
for dep in annotations.get('deps', []): for dep in annotations.get("deps", []):
parser.add_label(dep) parser.add_label(dep)
# get names of other pipes to disable them during training # get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser'] other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
with nlp.disable_pipes(*other_pipes): # only train parser with nlp.disable_pipes(*other_pipes): # only train parser
optimizer = nlp.begin_training() optimizer = nlp.begin_training()
for itn in range(n_iter): for itn in range(n_iter):
random.shuffle(TRAIN_DATA) random.shuffle(TRAIN_DATA)
losses = {} losses = {}
# batch up the examples using spaCy's minibatch # batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)) batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches: for batch in batches:
texts, annotations = zip(*batch) texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, losses=losses) nlp.update(texts, annotations, sgd=optimizer, losses=losses)
print('Losses', losses) print("Losses", losses)
# test the trained model # test the trained model
test_text = "I like securities." test_text = "I like securities."
doc = nlp(test_text) doc = nlp(test_text)
print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc]) print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])
# save model to output directory # save model to output directory
if output_dir is not None: if output_dir is not None:
@ -87,10 +94,10 @@ def main(model=None, output_dir=None, n_iter=10):
print("Loading from", output_dir) print("Loading from", output_dir)
nlp2 = spacy.load(output_dir) nlp2 = spacy.load(output_dir)
doc = nlp2(test_text) doc = nlp2(test_text)
print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc]) print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)
# expected result: # expected result:

View File

@ -25,11 +25,7 @@ from spacy.util import minibatch, compounding
# http://universaldependencies.github.io/docs/u/pos/index.html # http://universaldependencies.github.io/docs/u/pos/index.html
# You may also specify morphological features for your tags, from the universal # You may also specify morphological features for your tags, from the universal
# scheme. # scheme.
TAG_MAP = { TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
'N': {'pos': 'NOUN'},
'V': {'pos': 'VERB'},
'J': {'pos': 'ADJ'}
}
# Usually you'll read this in, of course. Data formats vary. Ensure your # Usually you'll read this in, of course. Data formats vary. Ensure your
# strings are unicode and that the number of tags assigned matches spaCy's # strings are unicode and that the number of tags assigned matches spaCy's
@ -37,16 +33,17 @@ TAG_MAP = {
# that specifies the gold-standard tokenization, e.g.: # that specifies the gold-standard tokenization, e.g.:
# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'] 'tags': ['V', 'J', 'N']}) # ("Eatblueham", {'words': ['Eat', 'blue', 'ham'] 'tags': ['V', 'J', 'N']})
TRAIN_DATA = [ TRAIN_DATA = [
("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}), ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
("Eat blue ham", {'tags': ['V', 'J', 'N']}) ("Eat blue ham", {"tags": ["V", "J", "N"]}),
] ]
@plac.annotations( @plac.annotations(
lang=("ISO Code of language to use", "option", "l", str), lang=("ISO Code of language to use", "option", "l", str),
output_dir=("Optional output directory", "option", "o", Path), output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int)) n_iter=("Number of training iterations", "option", "n", int),
def main(lang='en', output_dir=None, n_iter=25): )
def main(lang="en", output_dir=None, n_iter=25):
"""Create a new model, set up the pipeline and train the tagger. In order to """Create a new model, set up the pipeline and train the tagger. In order to
train the tagger with a custom tag map, we're creating a new Language train the tagger with a custom tag map, we're creating a new Language
instance with a custom vocab. instance with a custom vocab.
@ -54,7 +51,7 @@ def main(lang='en', output_dir=None, n_iter=25):
nlp = spacy.blank(lang) nlp = spacy.blank(lang)
# add the tagger to the pipeline # add the tagger to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy # nlp.create_pipe works for built-ins that are registered with spaCy
tagger = nlp.create_pipe('tagger') tagger = nlp.create_pipe("tagger")
# Add the tags. This needs to be done before you start training. # Add the tags. This needs to be done before you start training.
for tag, values in TAG_MAP.items(): for tag, values in TAG_MAP.items():
tagger.add_label(tag, values) tagger.add_label(tag, values)
@ -65,16 +62,16 @@ def main(lang='en', output_dir=None, n_iter=25):
random.shuffle(TRAIN_DATA) random.shuffle(TRAIN_DATA)
losses = {} losses = {}
# batch up the examples using spaCy's minibatch # batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)) batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches: for batch in batches:
texts, annotations = zip(*batch) texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, losses=losses) nlp.update(texts, annotations, sgd=optimizer, losses=losses)
print('Losses', losses) print("Losses", losses)
# test the trained model # test the trained model
test_text = "I like blue eggs" test_text = "I like blue eggs"
doc = nlp(test_text) doc = nlp(test_text)
print('Tags', [(t.text, t.tag_, t.pos_) for t in doc]) print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])
# save model to output directory # save model to output directory
if output_dir is not None: if output_dir is not None:
@ -88,10 +85,10 @@ def main(lang='en', output_dir=None, n_iter=25):
print("Loading from", output_dir) print("Loading from", output_dir)
nlp2 = spacy.load(output_dir) nlp2 = spacy.load(output_dir)
doc = nlp2(test_text) doc = nlp2(test_text)
print('Tags', [(t.text, t.tag_, t.pos_) for t in doc]) print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)
# Expected output: # Expected output:

View File

@ -23,55 +23,62 @@ from spacy.util import minibatch, compounding
model=("Model name. Defaults to blank 'en' model.", "option", "m", str), model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
output_dir=("Optional output directory", "option", "o", Path), output_dir=("Optional output directory", "option", "o", Path),
n_texts=("Number of texts to train from", "option", "t", int), n_texts=("Number of texts to train from", "option", "t", int),
n_iter=("Number of training iterations", "option", "n", int)) n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=20, n_texts=2000): def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
if model is not None: if model is not None:
nlp = spacy.load(model) # load existing spaCy model nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model) print("Loaded model '%s'" % model)
else: else:
nlp = spacy.blank('en') # create blank Language class nlp = spacy.blank("en") # create blank Language class
print("Created blank 'en' model") print("Created blank 'en' model")
# add the text classifier to the pipeline if it doesn't exist # add the text classifier to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy # nlp.create_pipe works for built-ins that are registered with spaCy
if 'textcat' not in nlp.pipe_names: if "textcat" not in nlp.pipe_names:
textcat = nlp.create_pipe('textcat') textcat = nlp.create_pipe("textcat")
nlp.add_pipe(textcat, last=True) nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it # otherwise, get it, so we can add labels to it
else: else:
textcat = nlp.get_pipe('textcat') textcat = nlp.get_pipe("textcat")
# add label to text classifier # add label to text classifier
textcat.add_label('POSITIVE') textcat.add_label("POSITIVE")
# load the IMDB dataset # load the IMDB dataset
print("Loading IMDB data...") print("Loading IMDB data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts) (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
print("Using {} examples ({} training, {} evaluation)" print(
.format(n_texts, len(train_texts), len(dev_texts))) "Using {} examples ({} training, {} evaluation)".format(
train_data = list(zip(train_texts, n_texts, len(train_texts), len(dev_texts)
[{'cats': cats} for cats in train_cats])) )
)
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
# get names of other pipes to disable them during training # get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat'] other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
with nlp.disable_pipes(*other_pipes): # only train textcat with nlp.disable_pipes(*other_pipes): # only train textcat
optimizer = nlp.begin_training() optimizer = nlp.begin_training()
print("Training the model...") print("Training the model...")
print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F')) print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
for i in range(n_iter): for i in range(n_iter):
losses = {} losses = {}
# batch up the examples using spaCy's minibatch # batch up the examples using spaCy's minibatch
batches = minibatch(train_data, size=compounding(4., 32., 1.001)) batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches: for batch in batches:
texts, annotations = zip(*batch) texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.2, nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
losses=losses)
with textcat.model.use_params(optimizer.averages): with textcat.model.use_params(optimizer.averages):
# evaluate on the dev data split off in load_data() # evaluate on the dev data split off in load_data()
scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats) scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table print(
.format(losses['textcat'], scores['textcat_p'], "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table
scores['textcat_r'], scores['textcat_f'])) losses["textcat"],
scores["textcat_p"],
scores["textcat_r"],
scores["textcat_f"],
)
)
# test the trained model # test the trained model
test_text = "This movie sucked" test_text = "This movie sucked"
@ -99,7 +106,7 @@ def load_data(limit=0, split=0.8):
random.shuffle(train_data) random.shuffle(train_data)
train_data = train_data[-limit:] train_data = train_data[-limit:]
texts, labels = zip(*train_data) texts, labels = zip(*train_data)
cats = [{'POSITIVE': bool(y)} for y in labels] cats = [{"POSITIVE": bool(y)} for y in labels]
split = int(len(train_data) * split) split = int(len(train_data) * split)
return (texts[:split], cats[:split]), (texts[split:], cats[split:]) return (texts[:split], cats[:split]), (texts[split:], cats[split:])
@ -116,9 +123,9 @@ def evaluate(tokenizer, textcat, texts, cats):
if label not in gold: if label not in gold:
continue continue
if score >= 0.5 and gold[label] >= 0.5: if score >= 0.5 and gold[label] >= 0.5:
tp += 1. tp += 1.0
elif score >= 0.5 and gold[label] < 0.5: elif score >= 0.5 and gold[label] < 0.5:
fp += 1. fp += 1.0
elif score < 0.5 and gold[label] < 0.5: elif score < 0.5 and gold[label] < 0.5:
tn += 1 tn += 1
elif score < 0.5 and gold[label] >= 0.5: elif score < 0.5 and gold[label] >= 0.5:
@ -126,8 +133,8 @@ def evaluate(tokenizer, textcat, texts, cats):
precision = tp / (tp + fp) precision = tp / (tp + fp)
recall = tp / (tp + fn) recall = tp / (tp + fn)
f_score = 2 * (precision * recall) / (precision + recall) f_score = 2 * (precision * recall) / (precision + recall)
return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score} return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)

View File

@ -14,8 +14,13 @@ from spacy.language import Language
@plac.annotations( @plac.annotations(
vectors_loc=("Path to .vec file", "positional", None, str), vectors_loc=("Path to .vec file", "positional", None, str),
lang=("Optional language ID. If not set, blank Language() will be used.", lang=(
"positional", None, str)) "Optional language ID. If not set, blank Language() will be used.",
"positional",
None,
str,
),
)
def main(vectors_loc, lang=None): def main(vectors_loc, lang=None):
if lang is None: if lang is None:
nlp = Language() nlp = Language()
@ -24,21 +29,21 @@ def main(vectors_loc, lang=None):
# save the model to disk and load it back later (models always need a # save the model to disk and load it back later (models always need a
# "lang" setting). Use 'xx' for blank multi-language class. # "lang" setting). Use 'xx' for blank multi-language class.
nlp = spacy.blank(lang) nlp = spacy.blank(lang)
with open(vectors_loc, 'rb') as file_: with open(vectors_loc, "rb") as file_:
header = file_.readline() header = file_.readline()
nr_row, nr_dim = header.split() nr_row, nr_dim = header.split()
nlp.vocab.reset_vectors(width=int(nr_dim)) nlp.vocab.reset_vectors(width=int(nr_dim))
for line in file_: for line in file_:
line = line.rstrip().decode('utf8') line = line.rstrip().decode("utf8")
pieces = line.rsplit(' ', int(nr_dim)) pieces = line.rsplit(" ", int(nr_dim))
word = pieces[0] word = pieces[0]
vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f') vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
nlp.vocab.set_vector(word, vector) # add the vectors to the vocab nlp.vocab.set_vector(word, vector) # add the vectors to the vocab
# test the vectors and similarity # test the vectors and similarity
text = 'class colspan' text = "class colspan"
doc = nlp(text) doc = nlp(text)
print(text, doc[0].similarity(doc[1])) print(text, doc[0].similarity(doc[1]))
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)

View File

@ -14,26 +14,45 @@ import plac
import spacy import spacy
import tensorflow as tf import tensorflow as tf
import tqdm import tqdm
from tensorflow.contrib.tensorboard.plugins.projector import visualize_embeddings, ProjectorConfig from tensorflow.contrib.tensorboard.plugins.projector import (
visualize_embeddings,
ProjectorConfig,
)
@plac.annotations( @plac.annotations(
vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str), vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str),
out_loc=("Path to output folder for tensorboard session data", "positional", None, str), out_loc=(
name=("Human readable name for tsv file and vectors tensor", "positional", None, str), "Path to output folder for tensorboard session data",
"positional",
None,
str,
),
name=(
"Human readable name for tsv file and vectors tensor",
"positional",
None,
str,
),
) )
def main(vectors_loc, out_loc, name="spaCy_vectors"): def main(vectors_loc, out_loc, name="spaCy_vectors"):
meta_file = "{}.tsv".format(name) meta_file = "{}.tsv".format(name)
out_meta_file = path.join(out_loc, meta_file) out_meta_file = path.join(out_loc, meta_file)
print('Loading spaCy vectors model: {}'.format(vectors_loc)) print("Loading spaCy vectors model: {}".format(vectors_loc))
model = spacy.load(vectors_loc) model = spacy.load(vectors_loc)
print('Finding lexemes with vectors attached: {}'.format(vectors_loc)) print("Finding lexemes with vectors attached: {}".format(vectors_loc))
strings_stream = tqdm.tqdm(model.vocab.strings, total=len(model.vocab.strings), leave=False) strings_stream = tqdm.tqdm(
model.vocab.strings, total=len(model.vocab.strings), leave=False
)
queries = [w for w in strings_stream if model.vocab.has_vector(w)] queries = [w for w in strings_stream if model.vocab.has_vector(w)]
vector_count = len(queries) vector_count = len(queries)
print('Building Tensorboard Projector metadata for ({}) vectors: {}'.format(vector_count, out_meta_file)) print(
"Building Tensorboard Projector metadata for ({}) vectors: {}".format(
vector_count, out_meta_file
)
)
# Store vector data in a tensorflow variable # Store vector data in a tensorflow variable
tf_vectors_variable = numpy.zeros((vector_count, model.vocab.vectors.shape[1])) tf_vectors_variable = numpy.zeros((vector_count, model.vocab.vectors.shape[1]))
@ -41,22 +60,26 @@ def main(vectors_loc, out_loc, name="spaCy_vectors"):
# Write a tab-separated file that contains information about the vectors for visualization # Write a tab-separated file that contains information about the vectors for visualization
# #
# Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata # Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata
with open(out_meta_file, 'wb') as file_metadata: with open(out_meta_file, "wb") as file_metadata:
# Define columns in the first row # Define columns in the first row
file_metadata.write("Text\tFrequency\n".encode('utf-8')) file_metadata.write("Text\tFrequency\n".encode("utf-8"))
# Write out a row for each vector that we add to the tensorflow variable we created # Write out a row for each vector that we add to the tensorflow variable we created
vec_index = 0 vec_index = 0
for text in tqdm.tqdm(queries, total=len(queries), leave=False): for text in tqdm.tqdm(queries, total=len(queries), leave=False):
# https://github.com/tensorflow/tensorflow/issues/9094 # https://github.com/tensorflow/tensorflow/issues/9094
text = '<Space>' if text.lstrip() == '' else text text = "<Space>" if text.lstrip() == "" else text
lex = model.vocab[text] lex = model.vocab[text]
# Store vector data and metadata # Store vector data and metadata
tf_vectors_variable[vec_index] = model.vocab.get_vector(text) tf_vectors_variable[vec_index] = model.vocab.get_vector(text)
file_metadata.write("{}\t{}\n".format(text, math.exp(lex.prob) * vector_count).encode('utf-8')) file_metadata.write(
"{}\t{}\n".format(text, math.exp(lex.prob) * vector_count).encode(
"utf-8"
)
)
vec_index += 1 vec_index += 1
print('Running Tensorflow Session...') print("Running Tensorflow Session...")
sess = tf.InteractiveSession() sess = tf.InteractiveSession()
tf.Variable(tf_vectors_variable, trainable=False, name=name) tf.Variable(tf_vectors_variable, trainable=False, name=name)
tf.global_variables_initializer().run() tf.global_variables_initializer().run()
@ -73,10 +96,10 @@ def main(vectors_loc, out_loc, name="spaCy_vectors"):
visualize_embeddings(writer, config) visualize_embeddings(writer, config)
# Save session and print run command to the output # Save session and print run command to the output
print('Saving Tensorboard Session...') print("Saving Tensorboard Session...")
saver.save(sess, path.join(out_loc, '{}.ckpt'.format(name))) saver.save(sess, path.join(out_loc, "{}.ckpt".format(name)))
print('Done. Run `tensorboard --logdir={0}` to view in Tensorboard'.format(out_loc)) print("Done. Run `tensorboard --logdir={0}` to view in Tensorboard".format(out_loc))
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)