various fixes in scripts - needs to be further tested
This commit is contained in:
parent
3c4f9e4cc4
commit
f7ad8e8c83
@@ -14,7 +14,7 @@ import spacy
 import spacy.util
 from bin.ud import conll17_ud_eval
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse, Example
+from spacy.gold import Example
 from spacy.util import compounding, minibatch, minibatch_by_words
 from spacy.syntax.nonproj import projectivize
 from spacy.matcher import Matcher
@@ -83,11 +83,11 @@ def read_data(
                 sent["heads"].append(head)
                 sent["deps"].append("ROOT" if dep == "root" else dep)
                 sent["spaces"].append(space_after == "_")
-            sent["entities"] = ["-"] * len(sent["words"])
+            sent["entities"] = ["-"] * len(sent["words"])  # TODO: doc-level format
             sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
             if oracle_segments:
                 docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
-                golds.append(GoldParse(docs[-1], **sent))
-                assert golds[-1].morphology is not None
+                golds.append(sent)

             sent_annots.append(sent)
@@ -151,28 +151,27 @@ def read_conllu(file_):


 def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
     # Flatten the conll annotations, and adjust the head indices
-    flat = defaultdict(list)
+    gold = defaultdict(list)
     sent_starts = []
     for sent in sent_annots:
-        flat["heads"].extend(len(flat["words"])+head for head in sent["heads"])
+        gold["heads"].extend(len(gold["words"])+head for head in sent["heads"])
         for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
-            flat[field].extend(sent[field])
+            gold[field].extend(sent[field])
         sent_starts.append(True)
         sent_starts.extend([False] * (len(sent["words"]) - 1))
     # Construct text if necessary
-    assert len(flat["words"]) == len(flat["spaces"])
+    assert len(gold["words"]) == len(gold["spaces"])
     if text is None:
         text = "".join(
-            word + " " * space for word, space in zip(flat["words"], flat["spaces"])
+            word + " " * space for word, space in zip(gold["words"], gold["spaces"])
         )
     doc = nlp.make_doc(text)
-    flat.pop("spaces")
-    gold = GoldParse(doc, **flat)
-    gold.sent_starts = sent_starts
+    gold.pop("spaces")
+    gold["sent_starts"] = sent_starts
     for i in range(len(gold.heads)):
         if random.random() < drop_deps:
-            gold.heads[i] = None
-            gold.labels[i] = None
+            gold["heads"][i] = None
+            gold["labels"][i] = None

     return doc, gold
@@ -183,15 +182,10 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):


 def golds_to_gold_data(docs, golds):
-    """Get out the training data format used by begin_training, given the
-    GoldParse objects."""
+    """Get out the training data format used by begin_training"""
     data = []
     for doc, gold in zip(docs, golds):
-        example = Example(doc=doc)
-        example.add_doc_annotation(cats=gold.cats)
-        token_annotation_dict = gold.orig.to_dict()
-        example.add_token_annotation(**token_annotation_dict)
-        example.goldparse = gold
+        example = Example.from_dict(doc, gold)
         data.append(example)
     return data

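The pattern adopted above, building an Example directly from a flat dict of annotations, can be sketched in isolation as follows. This is an illustrative sketch, not part of the diff: the sentence, tags, heads and deps are invented, and the spacy.gold import path is the one used on this branch (later releases expose Example from spacy.training).

```python
import spacy
from spacy.gold import Example  # path on this branch; later: spacy.training

nlp = spacy.blank("en")
doc = nlp.make_doc("She ate the pizza")
# flat dict-of-lists annotation, the same shape _make_gold now returns,
# with sent_starts carried as a plain key instead of a GoldParse attribute
annotations = {
    "words": ["She", "ate", "the", "pizza"],
    "tags": ["PRP", "VBD", "DT", "NN"],
    "heads": [1, 1, 3, 1],
    "deps": ["nsubj", "ROOT", "det", "dobj"],
    "sent_starts": [True, False, False, False],
}
example = Example.from_dict(doc, annotations)
```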
@@ -359,8 +353,8 @@ def initialize_pipeline(nlp, examples, config, device):
         nlp.parser.add_multitask_objective("tag")
     if config.multitask_sent:
         nlp.parser.add_multitask_objective("sent_start")
-    for ex in examples:
-        gold = ex.gold
+    for eg in examples:
+        gold = eg.gold
         for tag in gold.tags:
             if tag is not None:
                 nlp.tagger.add_label(tag)
@@ -541,7 +535,7 @@ def main(
         else:
             batches = minibatch(examples, size=batch_sizes)
         losses = {}
-        n_train_words = sum(len(ex.doc) for ex in examples)
+        n_train_words = sum(len(eg.doc) for eg in examples)
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             for batch in batches:
                 pbar.update(sum(len(ex.doc) for ex in batch))
@@ -12,7 +12,7 @@ import tqdm
 import spacy
 import spacy.util
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse, Example
+from spacy.gold import Example
 from spacy.syntax.nonproj import projectivize
 from collections import defaultdict
 from spacy.matcher import Matcher
@@ -33,31 +33,6 @@ random.seed(0)
 numpy.random.seed(0)


-def minibatch_by_words(examples, size=5000):
-    random.shuffle(examples)
-    if isinstance(size, int):
-        size_ = itertools.repeat(size)
-    else:
-        size_ = size
-    examples = iter(examples)
-    while True:
-        batch_size = next(size_)
-        batch = []
-        while batch_size >= 0:
-            try:
-                example = next(examples)
-            except StopIteration:
-                if batch:
-                    yield batch
-                return
-            batch_size -= len(example.doc)
-            batch.append(example)
-        if batch:
-            yield batch
-        else:
-            break
-
-
 ################
 # Data reading #
 ################
@@ -110,7 +85,7 @@ def read_data(
             sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
             if oracle_segments:
                 docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
-                golds.append(GoldParse(docs[-1], **sent))
+                golds.append(sent)

             sent_annots.append(sent)
             if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
@@ -159,20 +134,19 @@ def read_conllu(file_):

 def _make_gold(nlp, text, sent_annots):
     # Flatten the conll annotations, and adjust the head indices
-    flat = defaultdict(list)
+    gold = defaultdict(list)
     for sent in sent_annots:
-        flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
+        gold["heads"].extend(len(gold["words"]) + head for head in sent["heads"])
         for field in ["words", "tags", "deps", "entities", "spaces"]:
-            flat[field].extend(sent[field])
+            gold[field].extend(sent[field])
     # Construct text if necessary
-    assert len(flat["words"]) == len(flat["spaces"])
+    assert len(gold["words"]) == len(gold["spaces"])
     if text is None:
         text = "".join(
-            word + " " * space for word, space in zip(flat["words"], flat["spaces"])
+            word + " " * space for word, space in zip(gold["words"], gold["spaces"])
         )
     doc = nlp.make_doc(text)
-    flat.pop("spaces")
-    gold = GoldParse(doc, **flat)
+    gold.pop("spaces")
     return doc, gold

@@ -182,15 +156,10 @@ def _make_gold(nlp, text, sent_annots):


 def golds_to_gold_data(docs, golds):
-    """Get out the training data format used by begin_training, given the
-    GoldParse objects."""
+    """Get out the training data format used by begin_training."""
     data = []
     for doc, gold in zip(docs, golds):
-        example = Example(doc=doc)
-        example.add_doc_annotation(cats=gold.cats)
-        token_annotation_dict = gold.orig.to_dict()
-        example.add_token_annotation(**token_annotation_dict)
-        example.goldparse = gold
+        example = Example.from_dict(doc, gold)
         data.append(example)
     return data

@@ -313,15 +282,15 @@ def initialize_pipeline(nlp, examples, config):
         nlp.parser.add_multitask_objective("sent_start")
     nlp.parser.moves.add_action(2, "subtok")
     nlp.add_pipe(nlp.create_pipe("tagger"))
-    for ex in examples:
-        for tag in ex.gold.tags:
+    for eg in examples:
+        for tag in eg.gold.tags:
             if tag is not None:
                 nlp.tagger.add_label(tag)
     # Replace labels that didn't make the frequency cutoff
     actions = set(nlp.parser.labels)
     label_set = set([act.split("-")[1] for act in actions if "-" in act])
-    for ex in examples:
-        gold = ex.gold
+    for eg in examples:
+        gold = eg.gold
         for i, label in enumerate(gold.labels):
             if label is not None and label not in label_set:
                 gold.labels[i] = label.split("||")[0]
@@ -415,13 +384,12 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
     optimizer = initialize_pipeline(nlp, examples, config)

     for i in range(config.nr_epoch):
-        docs = [nlp.make_doc(example.doc.text) for example in examples]
-        batches = minibatch_by_words(examples, size=config.batch_size)
+        batches = spacy.minibatch_by_words(examples, size=config.batch_size)
         losses = {}
-        n_train_words = sum(len(doc) for doc in docs)
+        n_train_words = sum(len(eg.reference.doc) for eg in examples)
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             for batch in batches:
-                pbar.update(sum(len(ex.doc) for ex in batch))
+                pbar.update(sum(len(eg.reference.doc) for eg in batch))
                 nlp.update(
                     examples=batch, sgd=optimizer, drop=config.dropout, losses=losses,
                 )
@@ -24,8 +24,10 @@ import random
 import plac
 import spacy
 import os.path

-from spacy.gold import read_json_file, GoldParse
+from spacy.gold.example import Example
+from spacy.tokens import Doc
+from spacy.gold import read_json_file

 random.seed(0)

@@ -59,27 +61,25 @@ def main(n_iter=10):
     print(nlp.pipeline)

     print("Create data", len(TRAIN_DATA))
-    optimizer = nlp.begin_training(get_examples=lambda: TRAIN_DATA)
+    optimizer = nlp.begin_training()
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for example in TRAIN_DATA:
-            for token_annotation in example.token_annotations:
-                doc = Doc(nlp.vocab, words=token_annotation.words)
-                gold = GoldParse.from_annotation(doc, example.doc_annotation, token_annotation)
-
-                nlp.update(
-                    examples=[(doc, gold)],  # 1 example
-                    drop=0.2,  # dropout - make it harder to memorise data
-                    sgd=optimizer,  # callable to update weights
-                    losses=losses,
-                )
+        for example_dict in TRAIN_DATA:
+            doc = Doc(nlp.vocab, words=example_dict["words"])
+            example = Example.from_dict(doc, example_dict)
+            nlp.update(
+                examples=[example],  # 1 example
+                drop=0.2,  # dropout - make it harder to memorise data
+                sgd=optimizer,  # callable to update weights
+                losses=losses,
+            )
         print(losses.get("nn_labeller", 0.0), losses["ner"])

     # test the trained model
-    for example in TRAIN_DATA:
-        if example.text is not None:
-            doc = nlp(example.text)
+    for example_dict in TRAIN_DATA:
+        if "text" in example_dict:
+            doc = nlp(example_dict["text"])
             print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
             print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

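For reference, a hypothetical TRAIN_DATA entry in the dict format the rewritten loop expects; the keys are the ones read above ("words", "text") plus an entity annotation, and the sentence and offsets are invented for illustration:

```python
example_dict = {
    "text": "Uber blew through a million",
    "words": ["Uber", "blew", "through", "a", "million"],
    # character-offset entity spans; BILUO tag strings are the other common form
    "entities": [(0, 4, "ORG")],
}
```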
@@ -1,8 +1,7 @@
 import re

 from ...gold import Example
-from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
-from ...gold import TokenAnnotation
+from ...gold import iob_to_biluo, spans_from_biluo_tags
 from ...language import Language
 from ...tokens import Doc, Token
 from .conll_ner2json import n_sents_info
@@ -42,10 +41,10 @@ def conllu2json(
     )
     has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
     for i, example in enumerate(conll_data):
-        raw += example.text
+        raw += example.predicted.text
         sentences.append(
             generate_sentence(
-                example.token_annotation,
+                example,
                 has_ner_tags,
                 MISC_NER_PATTERN,
                 ner_map=ner_map,
@@ -268,36 +267,14 @@ def example_from_conllu_sentence(
     doc = merge_conllu_subtokens(lines, doc)

     # create Example from custom Doc annotation
-    ids, words, tags, heads, deps = [], [], [], [], []
-    pos, lemmas, morphs, spaces = [], [], [], []
+    words, spaces = [], []
     for i, t in enumerate(doc):
-        ids.append(i)
         words.append(t._.merged_orth)
-        if append_morphology and t._.merged_morph:
-            tags.append(t.tag_ + "__" + t._.merged_morph)
-        else:
-            tags.append(t.tag_)
-        pos.append(t.pos_)
-        morphs.append(t._.merged_morph)
-        lemmas.append(t._.merged_lemma)
-        heads.append(t.head.i)
-        deps.append(t.dep_)
         spaces.append(t._.merged_spaceafter)
-    ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
-    ents = biluo_tags_from_offsets(doc, ent_offsets)
-    example = Example(doc=Doc(vocab, words=words, spaces=spaces))
-    example.token_annotation = TokenAnnotation(
-        ids=ids,
-        words=words,
-        tags=tags,
-        pos=pos,
-        morphs=morphs,
-        lemmas=lemmas,
-        heads=heads,
-        deps=deps,
-        entities=ents,
-    )
-    return example
+        if append_morphology and t._.merged_morph:
+            t.tag_ = t.tag_ + "__" + t._.merged_morph
+
+    return Example(predicted=Doc(vocab, words=words, spaces=spaces), reference=doc)


 def merge_conllu_subtokens(lines, doc):
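A minimal sketch of the predicted/reference split the converter now returns, assuming this branch's Example constructor accepts the two Docs as keyword arguments (as used above): the predicted side is the bare tokenization the model will see, the reference side is the Doc carrying the gold annotation. Words and spaces below are invented.

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.gold import Example  # later releases: spacy.training.Example

vocab = Vocab()
words = ["I", "like", "trees"]
spaces = [True, True, False]
predicted = Doc(vocab, words=words, spaces=spaces)  # unannotated input tokenization
reference = Doc(vocab, words=words, spaces=spaces)  # would carry tags/heads/entities
example = Example(predicted=predicted, reference=reference)
```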
@@ -69,6 +69,7 @@ def docs_to_json(docs, id=0, ner_missing_tag="O"):


 def read_json_file(loc, docs_filter=None, limit=None):
+    """Read Example dictionaries from a json file or directory."""
     loc = util.ensure_path(loc)
     if loc.is_dir():
         for filename in loc.iterdir():
@@ -105,7 +106,7 @@ def json_to_annotations(doc):
         sent_start_i = len(words)
         for i, token in enumerate(sent["tokens"]):
             words.append(token["orth"])
-            spaces.append(token["space"])
+            spaces.append(token.get("space", True))
             ids.append(token.get('id', sent_start_i + i))
             tags.append(token.get('tag', "-"))
             pos.append(token.get("pos", ""))
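The switch to token.get("space", True) makes the per-token "space" key optional in the JSON training format; a missing key now defaults to "the token is followed by a space". A tiny illustration with a made-up token entry:

```python
token_entry = {"orth": "pizza", "tag": "NN", "id": 3}  # no "space" key present
assert token_entry.get("space", True) is True
```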
@@ -804,7 +804,6 @@ class Language(object):
         cleanup=False,
         component_cfg=None,
         n_process=1,
-        as_example=False,
     ):
         """Process texts as a stream, and yield `Doc` objects in order.

@@ -837,8 +836,7 @@ class Language(object):
                 batch_size=batch_size,
                 disable=disable,
                 n_process=n_process,
-                component_cfg=component_cfg,
-                as_example=as_example,
+                component_cfg=component_cfg
             )
             for doc, context in zip(docs, contexts):
                 yield (doc, context)
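With the as_example flag gone, Language.pipe always yields Doc objects (or (Doc, context) tuples when as_tuples is set, as in the branch shown above). A small usage sketch against the public API, with invented texts and contexts:

```python
import spacy

nlp = spacy.blank("en")
data = [("First text.", {"id": 1}), ("Second, longer text.", {"id": 2})]
for doc, context in nlp.pipe(data, as_tuples=True):
    print(context["id"], [token.text for token in doc])
```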
@@ -26,7 +26,7 @@ def test_sentencizer_pipe():
         sent_starts = [t.is_sent_start for t in doc]
         assert sent_starts == [True, False, True, False, False, False, False]
         assert len(list(doc.sents)) == 2
-    for ex in nlp.pipe(texts, as_example=True):
+    for ex in nlp.pipe(texts):
         doc = ex.doc
         assert doc.is_sentenced
         sent_starts = [t.is_sent_start for t in doc]
@@ -1,5 +1,4 @@
 import pytest
-from spacy.gold import Example

 from .util import get_random_doc

@@ -205,7 +205,7 @@ cdef class Tokenizer:
         doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
         return doc

-    def pipe(self, texts, batch_size=1000, n_threads=-1, as_example=False):
+    def pipe(self, texts, batch_size=1000, n_threads=-1):
         """Tokenize a stream of texts.

         texts: A sequence of unicode texts.
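The same flag is removed from Tokenizer.pipe, which now always yields Doc objects as well; a short sketch with invented texts:

```python
import spacy

nlp = spacy.blank("en")
for doc in nlp.tokenizer.pipe(["One text.", "Another text."], batch_size=2):
    print([token.text for token in doc])
```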