2016-10-16 22:34:57 +03:00
|
|
|
from __future__ import unicode_literals, print_function
|
|
|
|
import json
|
|
|
|
import pathlib
|
|
|
|
import random
|
|
|
|
|
2017-05-31 14:42:12 +03:00
|
|
|
import spacy.lang.en
|
|
|
|
from spacy.gold import GoldParse, biluo_tags_from_offsets
|
2016-10-16 22:34:57 +03:00
|
|
|
|
2017-01-27 14:27:10 +03:00
|
|
|
|
2017-05-31 14:42:12 +03:00
|
|
|
def reformat_train_data(tokenizer, examples):
    """Reformat data to match JSON format.

    Args:
        tokenizer: Callable that maps a raw text to a sequence of tokens,
            each exposing a ``.text`` attribute.
        examples: Iterable of ``(text, entity_offsets)`` pairs, where
            ``entity_offsets`` is passed unchanged to
            ``biluo_tags_from_offsets``.

    Returns:
        A list with one ``(text, [(sentence, [])])`` entry per example.
        ``sentence`` is the tuple
        ``(ids, words, tags, heads, deps, ner_tags)``; tags, heads and deps
        are placeholder values because only the NER tags are annotated.
    """
    output = []
    for text, entity_offsets in examples:
        # Tokenize once and reuse the result for both the BILUO tags and the
        # word list, instead of running the tokenizer twice per example.
        doc = tokenizer(text)
        ner_tags = biluo_tags_from_offsets(doc, entity_offsets)
        n_tokens = len(doc)
        words = [w.text for w in doc]
        tags = ['-'] * n_tokens
        heads = [0] * n_tokens
        deps = [''] * n_tokens
        # Materialize the ids as a list so the structure is the same on
        # Python 2 and 3 (a lazy ``range`` is not JSON-serializable).
        ids = list(range(n_tokens))
        sentence = (ids, words, tags, heads, deps, ner_tags)
        output.append((text, [(sentence, [])]))
    return output
|
2016-10-16 22:34:57 +03:00
|
|
|
|
|
|
|
|
2016-12-12 23:09:49 +03:00
|
|
|
def main(model_dir=None):
    """Train an English NER pipeline on two toy sentences and print its tags.

    Args:
        model_dir: Directory the trained pipeline is saved to and then
            reloaded from. When ``None`` (the default) the save/reload
            round-trip is skipped and the in-memory pipeline is used for the
            final predictions, instead of failing on ``to_disk(None)`` after
            training has finished.
    """
    # Entity annotations are (start_char, end_char, label) offsets into text.
    train_data = [
        (
            'Who is Shaka Khan?',
            [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
        ),
        (
            'I like London and Berlin.',
            [(len('I like '), len('I like London'), 'LOC'),
             (len('I like London and '), len('I like London and Berlin'), 'LOC')]
        )
    ]
    nlp = spacy.lang.en.English(pipeline=['tensorizer', 'ner'])

    def get_data():
        # begin_training takes a callable that returns the gold tuples.
        return reformat_train_data(nlp.tokenizer, train_data)

    optimizer = nlp.begin_training(get_data)
    for _ in range(100):
        random.shuffle(train_data)
        losses = {}
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            nlp.update(
                [doc],  # Batch of Doc objects
                [gold],  # Batch of GoldParse objects
                drop=0.5,  # Dropout -- make it harder to memorise data
                sgd=optimizer,  # Callable to update weights
                losses=losses)
        print(losses)

    if model_dir is not None:
        # Round-trip through disk to check the saved model is usable.
        print("Save to", model_dir)
        nlp.to_disk(model_dir)
        print("Load from", model_dir)
        nlp = spacy.lang.en.English(pipeline=['tensorizer', 'ner'])
        nlp.from_disk(model_dir)

    for raw_text, _ in train_data:
        doc = nlp(raw_text)
        for word in doc:
            print(word.text, word.ent_type_, word.ent_iob_)
|
2016-10-16 22:34:57 +03:00
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Imported here so plac is only required when run as a script.
    import plac

    # Hand main over to plac, which invokes it with the command-line args.
    plac.call(main)
|
2016-10-16 22:34:57 +03:00
|
|
|
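# Example per-token output: text, entity type, IOB code
# (spaCy ent_iob integer codes: 1 = inside, 2 = outside, 3 = begin).
# Who "" 2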
|
|
|
|
# is "" 2
|
|
|
|
# Shaka "" PERSON 3
|
|
|
|
# Khan "" PERSON 1
|
|
|
|
# ? "" 2
|