Mirror of https://github.com/explosion/spaCy.git
Support fused tokens (rattily)
This commit is contained in:
parent 9db60acd7c
commit 7c5bde3f8c
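For context on what the diff below handles (not part of the commit itself): in CoNLL-U, a fused surface form such as French "du" is written as a multiword-token range line (ID "1-2") followed by one line per syntactic word ("de", "le"). Below is a minimal sketch of the ID bookkeeping the read_data() change performs; the sample rows and variable names are illustrative only, and the real code additionally tracks trailing spaces.

from collections import defaultdict

# Illustrative CoNLL-U rows as (ID, FORM): "du" is the fused surface form of "de" + "le".
rows = [("1-2", "du"), ("1", "de"), ("2", "le"), ("3", "chat")]

sent = defaultdict(list)
fused_ids = set()
for id_, word in rows:
    if "-" in id_:
        # Range line: remember which word IDs it covers, keep only the surface form.
        start, end = id_.split("-")
        fused_ids.update(str(i) for i in range(int(start), int(end) + 1))
        sent["tokens"].append(word)
        continue
    if id_ not in fused_ids:
        sent["tokens"].append(word)   # ordinary token: surface form == word form
    sent["words"].append(word)        # syntactic words are always kept

print(sent["tokens"])  # ['du', 'chat']
print(sent["words"])   # ['de', 'le', 'chat']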
@@ -18,6 +18,7 @@ from spacy.syntax.nonproj import projectivize
 from collections import defaultdict, Counter
 from timeit import default_timer as timer
 from spacy.matcher import Matcher
+from spacy.morphology import Fused_begin, Fused_inside

 import itertools
 import random
@@ -84,18 +85,28 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
         sent_annots = []
         for cs in cd:
             sent = defaultdict(list)
+            fused_ids = set()
             for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
                 if '.' in id_:
                     continue
                 if '-' in id_:
+                    fuse_start, fuse_end = id_.split('-')
+                    for sub_id in range(int(fuse_start), int(fuse_end)+1):
+                        fused_ids.add(str(sub_id))
+                    sent['tokens'].append(word)
                     continue
+                if id_ not in fused_ids:
+                    sent['tokens'].append(word)
+                    if space_after == '_':
+                        sent['tokens'][-1] += ' '
+                elif id_ == fuse_end and space_after == '_':
+                    sent['tokens'][-1] += ' '
                 id_ = int(id_)-1
                 head = int(head)-1 if head != '0' else id_
                 sent['words'].append(word)
                 sent['tags'].append(tag)
                 sent['heads'].append(head)
                 sent['deps'].append('ROOT' if dep == 'root' else dep)
-                sent['spaces'].append(space_after == '_')
             sent['entities'] = ['-'] * len(sent['words'])
             sent['heads'], sent['deps'] = projectivize(sent['heads'],
                                                        sent['deps'])
@@ -153,14 +164,13 @@ def _make_gold(nlp, text, sent_annots):
     flat = defaultdict(list)
     for sent in sent_annots:
         flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
-        for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
+        for field in ['words', 'tags', 'deps', 'entities', 'tokens']:
             flat[field].extend(sent[field])
     # Construct text if necessary
-    assert len(flat['words']) == len(flat['spaces'])
     if text is None:
-        text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces']))
+        text = ''.join(flat['tokens'])
     doc = nlp.make_doc(text)
-    flat.pop('spaces')
+    flat.pop('tokens')
     gold = GoldParse(doc, **flat)
     return doc, gold

@@ -210,12 +220,39 @@ def write_conllu(docs, file_):
         file_.write("# newdoc id = {i}\n".format(i=i))
         for j, sent in enumerate(doc.sents):
             file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
-            file_.write("# text = {text}\n".format(text=sent.text))
+            file_.write('# text = {text}\n'.format(text=sent.text))
             for k, token in enumerate(sent):
-                file_.write(token._.get_conllu_lines(k) + '\n')
+                file_.write(_get_token_conllu(token, k, len(sent)) + '\n')
             file_.write('\n')


+def _get_token_conllu(token, k, sent_len):
+    if token.check_morph(Fused_begin) and (k+1 < sent_len):
+        n = 1
+        text = [token.text]
+        while token.nbor(n).check_morph(Fused_inside):
+            text.append(token.nbor(n).text)
+            n += 1
+        id_ = '%d-%d' % (k+1, (k+n))
+        fields = [id_, ''.join(text)] + ['_'] * 8
+        lines = ['\t'.join(fields)]
+    else:
+        lines = []
+    if token.head.i == token.i:
+        head = 0
+    else:
+        head = k + (token.head.i - token.i) + 1
+    fields = [str(k+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
+              str(head), token.dep_.lower(), '_', '_']
+    if token.check_morph(Fused_begin) and (k+1 < sent_len):
+        if k == 0:
+            fields[1] = token.norm_[0].upper() + token.norm_[1:]
+        else:
+            fields[1] = token.norm_
+    lines.append('\t'.join(fields))
+    return '\n'.join(lines)
+
+
 def print_progress(itn, losses, ud_scores):
     fields = {
         'dep_loss': losses.get('parser', 0.0),
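For reference, a hedged illustration (not output produced by the commit) of the column layout the new _get_token_conllu() is aiming for: a fused surface form gets a range-ID line with underscores in the remaining nine columns, followed by one ten-column line per syntactic word. The lemmas, tags and heads below are made up for the example.

# Illustrative only: the CoNLL-U shape _get_token_conllu() builds for a fused "du".
lines = [
    "1-2\tdu\t_\t_\t_\t_\t_\t_\t_\t_",
    "1\tDe\tde\tADP\t_\t_\t3\tcase\t_\t_",
    "2\tle\tle\tDET\t_\t_\t3\tdet\t_\t_",
    "3\tchat\tchat\tNOUN\t_\t_\t0\troot\t_\t_",
]
assert all(len(line.split("\t")) == 10 for line in lines)
print("\n".join(lines))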
@@ -240,31 +277,6 @@ def print_progress(itn, losses, ud_scores):
     ))
     print(tpl.format(itn, **fields))

-#def get_sent_conllu(sent, sent_id):
-#    lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
-
-def get_token_conllu(token, i):
-    if token._.begins_fused:
-        n = 1
-        while token.nbor(n)._.inside_fused:
-            n += 1
-        id_ = '%d-%d' % (k, k+n)
-        lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']
-    else:
-        lines = []
-    if token.head.i == token.i:
-        head = 0
-    else:
-        head = i + (token.head.i - token.i) + 1
-    fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
-              str(head), token.dep_.lower(), '_', '_']
-    lines.append('\t'.join(fields))
-    return '\n'.join(lines)
-
-Token.set_extension('get_conllu_lines', method=get_token_conllu)
-Token.set_extension('begins_fused', default=False)
-Token.set_extension('inside_fused', default=False)
-

 ##################
 # Initialization #
@@ -278,14 +290,63 @@ def load_nlp(corpus, config):
         nlp.vocab.from_disk(config.vectors / 'vocab')
     return nlp

+def extract_tokenizer_exceptions(paths):
+    with paths.train.conllu.open() as file_:
+        conllu = read_conllu(file_)
+    fused = defaultdict(lambda: defaultdict(list))
+    for doc in conllu:
+        for sent in doc:
+            for i, token in enumerate(sent):
+                if '-' in token[0]:
+                    start, end = token[0].split('-')
+                    length = int(end) - int(start)
+                    subtokens = sent[i+1 : i+1+length+1]
+                    forms = [t[1].lower() for t in subtokens]
+                    fused[token[1]][tuple(forms)].append(subtokens)
+    exc = {}
+    for word, expansions in fused.items():
+        by_freq = [(len(occurs), key, occurs) for key, occurs in expansions.items()]
+        freq, key, occurs = max(by_freq)
+        if word == ''.join(key):
+            # Happy case: we get a perfect split, with each letter accounted for.
+            analysis = [{'ORTH': subtoken} for subtoken in key]
+        elif len(word) == sum(len(subtoken) for subtoken in key):
+            # Unideal, but at least lengths match.
+            analysis = []
+            remain = word
+            for subtoken in key:
+                analysis.append({'ORTH': remain[:len(subtoken)]})
+                remain = remain[len(subtoken):]
+            assert len(remain) == 0, (word, key, remain)
+        else:
+            # Let's say word is 6 long, and there are three subtokens. The orths
+            # *must* equal the original string. Arbitrarily, split [4, 1, 1]
+            first = word[:len(word)-(len(key)-1)]
+            subtokens = [first]
+            remain = word[len(first):]
+            for i in range(1, len(key)):
+                subtokens.append(remain[:1])
+                remain = remain[1:]
+            assert len(remain) == 0, (word, subtokens, remain)
+            analysis = [{'ORTH': subtoken} for subtoken in subtokens]
+        for i, token in enumerate(occurs[0]):
+            analysis[i]['NORM'] = token[1]
+        analysis[0]['morphology'] = [Fused_begin]
+        for subtoken in analysis[1:]:
+            subtoken['morphology'] = [Fused_inside]
+        exc[word] = analysis
+    return exc
+
 def initialize_pipeline(nlp, docs, golds, config):
+    nlp.add_pipe(nlp.create_pipe('tagger'))
     nlp.add_pipe(nlp.create_pipe('parser'))
+    nlp.parser.moves.add_action(2, 'subtok')
     if config.multitask_tag:
         nlp.parser.add_multitask_objective('tag')
     if config.multitask_sent:
         nlp.parser.add_multitask_objective('sent_start')
-    nlp.parser.moves.add_action(2, 'subtok')
-    nlp.add_pipe(nlp.create_pipe('tagger'))
+    if config.multitask_dep:
+        nlp.parser.add_multitask_objective('dep')
     for gold in golds:
         for tag in gold.tags:
             if tag is not None:
@@ -308,6 +369,7 @@ def initialize_pipeline(nlp, docs, golds, config):
 class Config(object):
     vectors = attr.ib(default=None)
     max_doc_length = attr.ib(default=10)
+    multitask_dep = attr.ib(default=True)
     multitask_tag = attr.ib(default=True)
     multitask_sent = attr.ib(default=True)
     nr_epoch = attr.ib(default=30)
@@ -362,7 +424,9 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
         (parses_dir / corpus).mkdir()
     print("Train and evaluate", corpus, "using lang", paths.lang)
     nlp = load_nlp(paths.lang, config)
-
+    tokenizer_exceptions = extract_tokenizer_exceptions(paths)
+    for orth, subtokens in tokenizer_exceptions.items():
+        nlp.tokenizer.add_special_case(orth, subtokens)
     docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
                             max_doc_length=config.max_doc_length, limit=limit)

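The tokenizer exceptions wired in at the end of main() go through spaCy's standard Tokenizer.add_special_case API. A minimal, self-contained sketch with a hypothetical entry of the shape extract_tokenizer_exceptions() produces (ORTH/NORM only; the 'morphology' key with Fused_begin/Fused_inside is specific to this experimental branch and omitted here, and the sketch assumes a regular spaCy install rather than that branch):

import spacy

nlp = spacy.blank("en")  # any blank pipeline works for the demonstration

# One entry of the kind extract_tokenizer_exceptions() builds:
# surface form -> list of per-subtoken attribute dicts.
analysis = [{"ORTH": "d", "NORM": "de"}, {"ORTH": "u", "NORM": "le"}]
nlp.tokenizer.add_special_case("du", analysis)

doc = nlp("du chat")
print([t.text for t in doc])   # ['d', 'u', 'chat']
print([t.norm_ for t in doc])  # ['de', 'le', 'chat']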