Add convert CLI option to merge CoNLL-U subtokens (#4722)

* Add convert CLI option to merge CoNLL-U subtokens

Add `-T` option to convert CLI that merges CoNLL-U subtokens into one
token in the converted data. Each CoNLL-U sentence is read into a `Doc`
and the `Retokenizer` is used to merge subtokens with features as
follows:

* `orth` is the merged token orth (should correspond to the raw text and
`# text`)

* `tag` is all subtoken tags concatenated with `_`, e.g. `ADP_DET`

* `pos` is the POS of the syntactic root of the span (as determined by
the Retokenizer)

* `morph` is all morphological features merged

* `lemma` is all subtoken lemmas concatenated with ` `, e.g. `de o`

* with `-m`, all morphological features are combined with the tag using
the separator `__`, e.g.
`ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art`

* `dep` is the dependency relation for the syntactic root of the span
(as determined by the Retokenizer)

Concatenated tags will be mapped to the UD POS of the syntactic root
(e.g., `ADP`) and the morphological features will be the combined
features.
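
The merged values for a subtoken span are composed roughly as follows (a
minimal standalone sketch mirroring `merge_conllu_subtokens` below; the
example inputs are illustrative):

```python
# Illustrative inputs for the Portuguese contraction "do" = "de" + "o"
subtok_tags = ["ADP", "DET"]
subtok_lemmas = ["de", "o"]
subtok_morphs = ["", "Definite=Def|Gender=Masc|Number=Sing|PronType=Art"]

tag = "_".join(subtok_tags)      # "ADP_DET"
lemma = " ".join(subtok_lemmas)  # "de o"

# union the morphological features, merging the values per field
fields = {}
for morph in subtok_morphs:
    for feature in (morph.split("|") if morph else []):
        field, values = feature.split("=", 1)
        fields.setdefault(field, set()).update(values.split(","))
morph = "|".join(sorted(f + "=" + ",".join(sorted(v)) for f, v in fields.items()))
# "Definite=Def|Gender=Masc|Number=Sing|PronType=Art"

# with -m, the merged morph is appended to the merged tag with "__"
tag_with_morph = tag + "__" + morph if morph else tag
# "ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art"
```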

In many cases, the original UD subtokens can be reconstructed from the
available features, given either a language-specific lookup table (e.g.,
Portuguese `do / ADP_DET / Definite=Def|Gender=Masc|Number=Sing|PronType=Art`
is `de / ADP` + `o / DET / Definite=Def|Gender=Masc|Number=Sing|PronType=Art`)
or lookup rules for forms containing open-class words (e.g., Spanish
`hablarlo / VERB_PRON / Case=Acc|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs|VerbForm=Inf`).
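
Such a lookup table is not part of this PR; a purely hypothetical sketch of
the reconstruction step, where `SUBTOK_LOOKUP` and `split_merged_token` are
illustrative names and not spaCy API:

```python
# Hypothetical language-specific lookup table mapping (form, merged tag)
# back to the original UD subtokens (illustrative only, not part of this PR).
SUBTOK_LOOKUP = {
    ("do", "ADP_DET"): [
        ("de", "ADP", ""),
        ("o", "DET", "Definite=Def|Gender=Masc|Number=Sing|PronType=Art"),
    ],
}

def split_merged_token(form, tag):
    """Return (form, pos, morph) triples for the original subtokens, if known."""
    return SUBTOK_LOOKUP.get((form, tag.split("__")[0]))

print(split_merged_token("do", "ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art"))
```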

* Clean up imports
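
A minimal usage sketch (the file names are illustrative; the equivalent CLI
call would be along the lines of
`python -m spacy convert file.conllu ./out -c conllu -T -m`):

```python
# Convert CoNLL-U data with subtoken merging and appended morphology,
# mirroring the new converter options (file name is illustrative).
from spacy.cli.converters import conllu2json

with open("file.conllu", encoding="utf8") as f:
    input_data = f.read()
docs = conllu2json(input_data, n_sents=10, merge_subtokens=True,
                   append_morphology=True)
```
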
adrianeboyd committed on 2020-01-29 17:44:25 +01:00
commit a365359b36 (parent 569cc98982)
3 changed files with 273 additions and 98 deletions


@@ -34,6 +34,7 @@ def convert(
     seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
     model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
     morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
+    merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
     converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
     ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
     lang: ("Language (if tokenizer required)", "option", "l", str) = None,
@@ -85,7 +86,8 @@ def convert(
         input_data,
         n_sents=n_sents,
         seg_sents=seg_sents,
-        use_morphology=morphology,
+        append_morphology=morphology,
+        merge_subtokens=merge_subtokens,
         lang=lang,
         model=model,
         no_print=no_print,


@@ -1,36 +1,36 @@
 import re
-from spacy.gold import Example
-from ...gold import iob_to_biluo
+from ...gold import Example
+from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
+from ...language import Language
+from ...tokens import Doc, Token
+from .conll_ner2json import n_sents_info
+from wasabi import Printer
 
 
 def conllu2json(
-    input_data, n_sents=10, use_morphology=False, lang=None, ner_map=None, **_
+    input_data, n_sents=10, append_morphology=False, lang=None, ner_map=None,
+    merge_subtokens=False, no_print=False, **_
 ):
     """
     Convert conllu files into JSON format for use with train cli.
-    use_morphology parameter enables appending morphology to tags, which is
+    append_morphology parameter enables appending morphology to tags, which is
     useful for languages such as Spanish, where UD tags are not so rich.
 
     Extract NER tags if available and convert them so that they follow
     BILUO and the Wikipedia scheme
     """
     # by @dvsrepo, via #11 explosion/spacy-dev-resources
     # by @katarkor
     # name=NER is to handle NorNE
     MISC_NER_PATTERN = r"\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"
+    msg = Printer(no_print=no_print)
+    n_sents_info(msg, n_sents)
     docs = []
     raw = ""
     sentences = []
-    conll_data = read_conllx(input_data, use_morphology=use_morphology)
-    checked_for_ner = False
-    has_ner_tags = False
+    conll_data = read_conllx(input_data, append_morphology=append_morphology,
+                             ner_tag_pattern=MISC_NER_PATTERN, ner_map=ner_map,
+                             merge_subtokens=merge_subtokens)
+    has_ner_tags = has_ner(input_data, ner_tag_pattern=MISC_NER_PATTERN)
     for i, example in enumerate(conll_data):
-        if not checked_for_ner:
-            has_ner_tags = is_ner(
-                example.token_annotation.entities[0], MISC_NER_PATTERN
-            )
-            checked_for_ner = True
         raw += example.text
         sentences.append(
             generate_sentence(
@@ -43,137 +43,273 @@ def conllu2json(
         # Real-sized documents could be extracted using the comments on the
         # conllu document
         if len(sentences) % n_sents == 0:
-            doc = create_doc(raw, sentences, i)
+            doc = create_json_doc(raw, sentences, i)
             docs.append(doc)
             raw = ""
             sentences = []
     if sentences:
-        doc = create_doc(raw, sentences, i)
+        doc = create_json_doc(raw, sentences, i)
         docs.append(doc)
     return docs
 
 
-def is_ner(tag, tag_pattern):
+def has_ner(input_data, ner_tag_pattern):
     """
     Check the 10th column of the first token to determine if the file contains
     NER tags
     """
-    tag_match = re.search(tag_pattern, tag)
-    if tag_match:
-        return True
-    elif tag == "O":
-        return True
-    else:
-        return False
+    for sent in input_data.strip().split("\n\n"):
+        lines = sent.strip().split("\n")
+        if lines:
+            while lines[0].startswith("#"):
+                lines.pop(0)
+            if lines:
+                parts = lines[0].split("\t")
+                id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+                if re.search(ner_tag_pattern, misc):
+                    return True
+                else:
+                    return False
 
 
-def read_conllx(input_data, use_morphology=False, n=0):
-    """ Yield example data points, one for each sentence """
-    i = 0
+def read_conllx(input_data, append_morphology=False, merge_subtokens=False,
+                ner_tag_pattern="", ner_map=None):
+    """ Yield examples, one for each sentence """
+    vocab = Language.Defaults.create_vocab()  # need vocab to make a minimal Doc
     for sent in input_data.strip().split("\n\n"):
         lines = sent.strip().split("\n")
         if lines:
             while lines[0].startswith("#"):
                 lines.pop(0)
-            ids, words, tags, heads, deps, ents = [], [], [], [], [], []
-            spaces = []
-            for line in lines:
-                parts = line.split("\t")
-                id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
-                if "-" in id_ or "." in id_:
-                    continue
-                try:
-                    id_ = int(id_) - 1
-                    head = (int(head) - 1) if head != "0" else id_
-                    dep = "ROOT" if dep == "root" else dep
-                    tag = pos if tag == "_" else tag
-                    tag = tag + "__" + morph if use_morphology else tag
-                    ent = misc if misc else "O"
-                    ids.append(id_)
-                    words.append(word)
-                    tags.append(tag)
-                    heads.append(head)
-                    deps.append(dep)
-                    ents.append(ent)
-                    if "SpaceAfter=No" in misc:
-                        spaces.append(False)
-                    else:
-                        spaces.append(True)
-                except:  # noqa: E722
-                    print(line)
-                    raise
-            raw = ""
-            for word, space in zip(words, spaces):
-                raw += word
-                if space:
-                    raw += " "
-            example = Example(doc=raw)
-            example.set_token_annotation(
-                ids=ids, words=words, tags=tags, heads=heads, deps=deps, entities=ents
-            )
+            example = example_from_conllu_sentence(vocab, lines,
+                ner_tag_pattern, merge_subtokens=merge_subtokens,
+                append_morphology=append_morphology,
+                ner_map=ner_map)
             yield example
-            i += 1
-            if 1 <= n <= i:
-                break
 
 
-def extract_tags(iob, tag_pattern, ner_map=None):
-    """
-    Extract tag from MISC column according to `tag_pattern` and map to final
-    entity type with `ner_map` if mapping present.
-
-    For NorNE:
-    Simplify tags obtained from the dataset in order to follow Wikipedia
-    scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
-    'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
-    'MISC'.
-    """
-    new_iob = []
-    for tag in iob:
-        tag_match = re.search(tag_pattern, tag)
-        new_tag = "O"
+def get_entities(lines, tag_pattern, ner_map=None):
+    """Find entities in the MISC column according to the pattern and map to
+    final entity type with `ner_map` if mapping present. Entity tag is 'O' if
+    the pattern is not matched.
+
+    lines (unicode): CoNLL-U lines for one sentence
+    tag_pattern (unicode): Regex pattern for entity tag
+    ner_map (dict): Map old NER tag names to new ones, '' maps to O.
+    RETURNS (list): List of BILUO entity tags
+    """
+    miscs = []
+    for line in lines:
+        parts = line.split("\t")
+        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+        if "-" in id_ or "." in id_:
+            continue
+        miscs.append(misc)
+    iob = []
+    for misc in miscs:
+        tag_match = re.search(tag_pattern, misc)
+        iob_tag = "O"
         if tag_match:
             prefix = tag_match.group(2)
             suffix = tag_match.group(3)
             if prefix and suffix:
-                new_tag = prefix + "-" + suffix
+                iob_tag = prefix + "-" + suffix
                 if ner_map:
                     suffix = ner_map.get(suffix, suffix)
                     if suffix == "":
-                        new_tag = "O"
+                        iob_tag = "O"
                     else:
-                        new_tag = prefix + "-" + suffix
-        new_iob.append(new_tag)
-    return new_iob
+                        iob_tag = prefix + "-" + suffix
+        iob.append(iob_tag)
+    return iob_to_biluo(iob)
 
 
 def generate_sentence(token_annotation, has_ner_tags, tag_pattern, ner_map=None):
     sentence = {}
     tokens = []
-    if has_ner_tags:
-        iob = extract_tags(token_annotation.entities, tag_pattern, ner_map=ner_map)
-        biluo = iob_to_biluo(iob)
-    for i, id in enumerate(token_annotation.ids):
+    for i, id_ in enumerate(token_annotation.ids):
         token = {}
-        token["id"] = id
-        token["orth"] = token_annotation.words[i]
-        token["tag"] = token_annotation.tags[i]
-        token["head"] = token_annotation.heads[i] - id
-        token["dep"] = token_annotation.deps[i]
+        token["id"] = id_
+        token["orth"] = token_annotation.get_word(i)
+        token["tag"] = token_annotation.get_tag(i)
+        token["pos"] = token_annotation.get_pos(i)
+        token["lemma"] = token_annotation.get_lemma(i)
+        token["morph"] = token_annotation.get_morph(i)
+        token["head"] = token_annotation.get_head(i) - id_
+        token["dep"] = token_annotation.get_dep(i)
         if has_ner_tags:
-            token["ner"] = biluo[i]
+            token["ner"] = token_annotation.get_entity(i)
         tokens.append(token)
     sentence["tokens"] = tokens
     return sentence
 
 
-def create_doc(raw, sentences, id):
+def create_json_doc(raw, sentences, id_):
     doc = {}
     paragraph = {}
-    doc["id"] = id
+    doc["id"] = id_
     doc["paragraphs"] = []
     paragraph["raw"] = raw.strip()
     paragraph["sentences"] = sentences
     doc["paragraphs"].append(paragraph)
     return doc
+
+
+def example_from_conllu_sentence(vocab, lines, ner_tag_pattern,
+                                 merge_subtokens=False, append_morphology=False,
+                                 ner_map=None):
+    """Create an Example from the lines for one CoNLL-U sentence, merging
+    subtokens and appending morphology to tags if required.
+
+    lines (unicode): The non-comment lines for a CoNLL-U sentence
+    ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col
+    RETURNS (Example): An example containing the annotation
+    """
+    # create a Doc with each subtoken as its own token
+    # if merging subtokens, each subtoken orth is the merged subtoken form
+    if not Token.has_extension("merged_orth"):
+        Token.set_extension("merged_orth", default="")
+    if not Token.has_extension("merged_lemma"):
+        Token.set_extension("merged_lemma", default="")
+    if not Token.has_extension("merged_morph"):
+        Token.set_extension("merged_morph", default="")
+    if not Token.has_extension("merged_spaceafter"):
+        Token.set_extension("merged_spaceafter", default="")
+    words, spaces, tags, poses, morphs, lemmas = [], [], [], [], [], []
+    heads, deps = [], []
+    subtok_word = ""
+    in_subtok = False
+    for line in lines:
+        parts = line.split("\t")
+        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+        if "." in id_:
+            continue
+        if "-" in id_:
+            in_subtok = True
+            subtok_word = word
+            subtok_start, subtok_end = id_.split("-")
+            subtok_spaceafter = "SpaceAfter=No" not in misc
+            continue
+        if merge_subtokens and in_subtok:
+            words.append(subtok_word)
+        else:
+            words.append(word)
+        if in_subtok:
+            if id_ == subtok_end:
+                spaces.append(subtok_spaceafter)
+            else:
+                spaces.append(False)
+        elif "SpaceAfter=No" in misc:
+            spaces.append(False)
+        else:
+            spaces.append(True)
+        if in_subtok and id_ == subtok_end:
+            subtok_word = ""
+            in_subtok = False
+        id_ = int(id_) - 1
+        head = (int(head) - 1) if head != "0" else id_
+        tag = pos if tag == "_" else tag
+        morph = morph if morph != "_" else ""
+        dep = "ROOT" if dep == "root" else dep
+        lemmas.append(lemma)
+        poses.append(pos)
+        tags.append(tag)
+        morphs.append(morph)
+        heads.append(head)
+        deps.append(dep)
+    doc = Doc(vocab, words=words, spaces=spaces)
+    for i in range(len(doc)):
+        doc[i].tag_ = tags[i]
+        doc[i].pos_ = poses[i]
+        doc[i].dep_ = deps[i]
+        doc[i].lemma_ = lemmas[i]
+        doc[i].head = doc[heads[i]]
+        doc[i]._.merged_orth = words[i]
+        doc[i]._.merged_morph = morphs[i]
+        doc[i]._.merged_lemma = lemmas[i]
+        doc[i]._.merged_spaceafter = spaces[i]
+    ents = get_entities(lines, ner_tag_pattern, ner_map)
+    doc.ents = spans_from_biluo_tags(doc, ents)
+    doc.is_parsed = True
+    doc.is_tagged = True
+    if merge_subtokens:
+        doc = merge_conllu_subtokens(lines, doc)
+    # create Example from custom Doc annotation
+    ids, words, tags, heads, deps = [], [], [], [], []
+    pos, lemmas, morphs, spaces = [], [], [], []
+    for i, t in enumerate(doc):
+        ids.append(i)
+        words.append(t._.merged_orth)
+        if append_morphology and t._.merged_morph:
+            tags.append(t.tag_ + "__" + t._.merged_morph)
+        else:
+            tags.append(t.tag_)
+        pos.append(t.pos_)
+        morphs.append(t._.merged_morph)
+        lemmas.append(t._.merged_lemma)
+        heads.append(t.head.i)
+        deps.append(t.dep_)
+        spaces.append(t._.merged_spaceafter)
+    ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
+    ents = biluo_tags_from_offsets(doc, ent_offsets)
+    raw = ""
+    for word, space in zip(words, spaces):
+        raw += word
+        if space:
+            raw += " "
+    example = Example(doc=raw)
+    example.set_token_annotation(ids=ids, words=words, tags=tags, pos=pos,
+                                 morphs=morphs, lemmas=lemmas, heads=heads,
+                                 deps=deps, entities=ents)
+    return example
+
+
+def merge_conllu_subtokens(lines, doc):
+    # identify and process all subtoken spans to prepare attrs for merging
+    subtok_spans = []
+    for line in lines:
+        parts = line.split("\t")
+        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+        if "-" in id_:
+            subtok_start, subtok_end = id_.split("-")
+            subtok_span = doc[int(subtok_start) - 1:int(subtok_end)]
+            subtok_spans.append(subtok_span)
+            # create merged tag, morph, and lemma values
+            tags = []
+            morphs = {}
+            lemmas = []
+            for token in subtok_span:
+                tags.append(token.tag_)
+                lemmas.append(token.lemma_)
+                if token._.merged_morph:
+                    for feature in token._.merged_morph.split("|"):
+                        field, values = feature.split("=", 1)
+                        if field not in morphs:
+                            morphs[field] = set()
+                        for value in values.split(","):
+                            morphs[field].add(value)
+            # create merged features for each morph field
+            for field, values in morphs.items():
+                morphs[field] = field + "=" + ",".join(sorted(values))
+            # set the same attrs on all subtok tokens so that whatever head the
+            # retokenizer chooses, the final attrs are available on that token
+            for token in subtok_span:
+                token._.merged_orth = token.orth_
+                token._.merged_lemma = " ".join(lemmas)
+                token.tag_ = "_".join(tags)
+                token._.merged_morph = "|".join(sorted(morphs.values()))
+                token._.merged_spaceafter = True if subtok_span[-1].whitespace_ else False
+    with doc.retokenize() as retokenizer:
+        for span in subtok_spans:
+            retokenizer.merge(span)
+    return doc


@@ -54,6 +54,43 @@ def test_cli_converters_conllu2json_name_ner_map():
     assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
 
 
+def test_cli_converters_conllu2json_subtokens():
+    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
+    lines = [
+        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
+        "2-3\tFE\t_\t_\t_\t_\t_\t_\t_\t_",
+        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tname=B-PER",
+        "3\tEilertsen\tEilertsen\tX\t_\tGender=Fem|Tense=past\t2\tname\t_\tname=I-PER",
+        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
+        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
+    ]
+    input_data = "\n".join(lines)
+    converted = conllu2json(input_data, n_sents=1, merge_subtokens=True,
+                            append_morphology=True)
+    assert len(converted) == 1
+    assert converted[0]["id"] == 0
+    assert len(converted[0]["paragraphs"]) == 1
+    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår."
+    assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
+    sent = converted[0]["paragraphs"][0]["sentences"][0]
+    assert len(sent["tokens"]) == 4
+    tokens = sent["tokens"]
+    assert [t["orth"] for t in tokens] == ["Dommer", "FE", "avstår", "."]
+    assert [t["tag"] for t in tokens] == [
+        "NOUN__Definite=Ind|Gender=Masc|Number=Sing",
+        "PROPN_X__Gender=Fem,Masc|Tense=past",
+        "VERB__Mood=Ind|Tense=Pres|VerbForm=Fin",
+        "PUNCT",
+    ]
+    assert [t["pos"] for t in tokens] == ["NOUN", "PROPN", "VERB", "PUNCT"]
+    assert [t["morph"] for t in tokens] == [
+        "Definite=Ind|Gender=Masc|Number=Sing",
+        "Gender=Fem,Masc|Tense=past",
+        "Mood=Ind|Tense=Pres|VerbForm=Fin",
+        "",
+    ]
+    assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
+    assert [t["head"] for t in tokens] == [1, 1, 0, -1]
+    assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
+    assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
+
+
 def test_cli_converters_iob2json():
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",