Add convert CLI option to merge CoNLL-U subtokens (#4722)

* Add convert CLI option to merge CoNLL-U subtokens

Add `-T` option to convert CLI that merges CoNLL-U subtokens into one
token in the converted data. Each CoNLL-U sentence is read into a `Doc`
and the `Retokenizer` is used to merge subtokens with features as
follows:

* `orth` is the merged token orth (should correspond to raw text and `#
text`)

* `tag` is all subtoken tags concatenated with `_`, e.g. `ADP_DET`

* `pos` is the POS of the syntactic root of the span (as determined by
the Retokenizer)

* `morph` is all morphological features merged

* `lemma` is all subtoken lemmas concatenated with ` `, e.g. `de o`

* with `-m` all morphological features are combined with the tag using
the separator `__`, e.g.
`ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art`

* `dep` is the dependency relation for the syntactic root of the span
(as determined by the Retokenizer)

Concatenated tags will be mapped to the UD POS of the syntactic root
(e.g., `ADP`) and the morphological features will be the combined
features.

In many cases, the original UD subtokens can be reconstructed from the
available features, either with a language-specific lookup table (e.g.,
Portuguese `do / ADP_DET /
Definite=Def|Gender=Masc|Number=Sing|PronType=Art` corresponds to `de / ADP`
plus `o / DET / Definite=Def|Gender=Masc|Number=Sing|PronType=Art`) or with
lookup rules for forms containing open-class words, such as Spanish
`hablarlo / VERB_PRON /
Case=Acc|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs|VerbForm=Inf`.

* Clean up imports
This commit is contained in:
adrianeboyd 2020-01-29 17:44:25 +01:00 committed by GitHub
parent 569cc98982
commit a365359b36
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 273 additions and 98 deletions

View File

@ -34,6 +34,7 @@ def convert(
seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False, seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None, model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False, morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto", converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None, ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
lang: ("Language (if tokenizer required)", "option", "l", str) = None, lang: ("Language (if tokenizer required)", "option", "l", str) = None,
@ -85,7 +86,8 @@ def convert(
input_data, input_data,
n_sents=n_sents, n_sents=n_sents,
seg_sents=seg_sents, seg_sents=seg_sents,
use_morphology=morphology, append_morphology=morphology,
merge_subtokens=merge_subtokens,
lang=lang, lang=lang,
model=model, model=model,
no_print=no_print, no_print=no_print,

View File

@ -1,36 +1,36 @@
import re import re
from spacy.gold import Example from ...gold import Example
from ...gold import iob_to_biluo from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
from ...language import Language
from ...tokens import Doc, Token
from .conll_ner2json import n_sents_info
from wasabi import Printer
def conllu2json( def conllu2json(
input_data, n_sents=10, use_morphology=False, lang=None, ner_map=None, **_ input_data, n_sents=10, append_morphology=False, lang=None, ner_map=None,
merge_subtokens=False, no_print=False, **_
): ):
""" """
Convert conllu files into JSON format for use with train cli. Convert conllu files into JSON format for use with train cli.
use_morphology parameter enables appending morphology to tags, which is append_morphology parameter enables appending morphology to tags, which is
useful for languages such as Spanish, where UD tags are not so rich. useful for languages such as Spanish, where UD tags are not so rich.
Extract NER tags if available and convert them so that they follow Extract NER tags if available and convert them so that they follow
BILUO and the Wikipedia scheme BILUO and the Wikipedia scheme
""" """
# by @dvsrepo, via #11 explosion/spacy-dev-resources
# by @katarkor
# name=NER is to handle NorNE
MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?" MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"
msg = Printer(no_print=no_print)
n_sents_info(msg, n_sents)
docs = [] docs = []
raw = "" raw = ""
sentences = [] sentences = []
conll_data = read_conllx(input_data, use_morphology=use_morphology) conll_data = read_conllx(input_data, append_morphology=append_morphology,
checked_for_ner = False ner_tag_pattern=MISC_NER_PATTERN, ner_map=ner_map,
has_ner_tags = False merge_subtokens=merge_subtokens)
has_ner_tags = has_ner(input_data, ner_tag_pattern=MISC_NER_PATTERN)
for i, example in enumerate(conll_data): for i, example in enumerate(conll_data):
if not checked_for_ner:
has_ner_tags = is_ner(
example.token_annotation.entities[0], MISC_NER_PATTERN
)
checked_for_ner = True
raw += example.text raw += example.text
sentences.append( sentences.append(
generate_sentence( generate_sentence(
@ -43,137 +43,273 @@ def conllu2json(
# Real-sized documents could be extracted using the comments on the # Real-sized documents could be extracted using the comments on the
# conllu document # conllu document
if len(sentences) % n_sents == 0: if len(sentences) % n_sents == 0:
doc = create_doc(raw, sentences, i) doc = create_json_doc(raw, sentences, i)
docs.append(doc) docs.append(doc)
raw = "" raw = ""
sentences = [] sentences = []
if sentences: if sentences:
doc = create_doc(raw, sentences, i) doc = create_json_doc(raw, sentences, i)
docs.append(doc) docs.append(doc)
return docs return docs
def is_ner(tag, tag_pattern): def has_ner(input_data, ner_tag_pattern):
""" """
Check the 10th column of the first token to determine if the file contains Check the 10th column of the first token to determine if the file contains
NER tags NER tags
""" """
tag_match = re.search(tag_pattern, tag) for sent in input_data.strip().split("\n\n"):
if tag_match: lines = sent.strip().split("\n")
return True if lines:
elif tag == "O": while lines[0].startswith("#"):
lines.pop(0)
if lines:
parts = lines[0].split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
if re.search(ner_tag_pattern, misc):
return True return True
else: else:
return False return False
def read_conllx(input_data, use_morphology=False, n=0): def read_conllx(input_data, append_morphology=False, merge_subtokens=False,
""" Yield example data points, one for each sentence """ ner_tag_pattern="", ner_map=None):
""" Yield examples, one for each sentence """
vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc
i = 0 i = 0
for sent in input_data.strip().split("\n\n"): for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n") lines = sent.strip().split("\n")
if lines: if lines:
while lines[0].startswith("#"): while lines[0].startswith("#"):
lines.pop(0) lines.pop(0)
ids, words, tags, heads, deps, ents = [], [], [], [], [], [] example = example_from_conllu_sentence(vocab, lines,
spaces = [] ner_tag_pattern, merge_subtokens=merge_subtokens,
append_morphology=append_morphology,
ner_map=ner_map)
yield example
def get_entities(lines, tag_pattern, ner_map=None):
"""Find entities in the MISC column according to the pattern and map to
final entity type with `ner_map` if mapping present. Entity tag is 'O' if
the pattern is not matched.
lines (unicode): CONLL-U lines for one sentences
tag_pattern (unicode): Regex pattern for entity tag
ner_map (dict): Map old NER tag names to new ones, '' maps to O.
RETURNS (list): List of BILUO entity tags
"""
miscs = []
for line in lines: for line in lines:
parts = line.split("\t") parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
if "-" in id_ or "." in id_: if "-" in id_ or "." in id_:
continue continue
try: miscs.append(misc)
id_ = int(id_) - 1
head = (int(head) - 1) if head != "0" else id_
dep = "ROOT" if dep == "root" else dep
tag = pos if tag == "_" else tag
tag = tag + "__" + morph if use_morphology else tag
ent = misc if misc else "O"
ids.append(id_) iob = []
for misc in miscs:
tag_match = re.search(tag_pattern, misc)
iob_tag = "O"
if tag_match:
prefix = tag_match.group(2)
suffix = tag_match.group(3)
if prefix and suffix:
iob_tag = prefix + "-" + suffix
if ner_map:
suffix = ner_map.get(suffix, suffix)
if suffix == "":
iob_tag = "O"
else:
iob_tag = prefix + "-" + suffix
iob.append(iob_tag)
return iob_to_biluo(iob)
def generate_sentence(token_annotation, has_ner_tags, tag_pattern, ner_map=None):
    """Build the JSON-style sentence dict for one sentence's annotation.

    token_annotation: Object exposing `ids` and the per-index accessors
        `get_word`, `get_tag`, `get_pos`, `get_lemma`, `get_morph`,
        `get_head`, `get_dep` and `get_entity`.
    has_ner_tags (bool): If true, each token also gets a "ner" entry.
    tag_pattern: Accepted for signature compatibility; not used here.
    ner_map: Accepted for signature compatibility; not used here.
    RETURNS (dict): `{"tokens": [...]}` with one dict per token.
    """
    entries = []
    for index, token_id in enumerate(token_annotation.ids):
        entry = {
            "id": token_id,
            "orth": token_annotation.get_word(index),
            "tag": token_annotation.get_tag(index),
            "pos": token_annotation.get_pos(index),
            "lemma": token_annotation.get_lemma(index),
            "morph": token_annotation.get_morph(index),
            # the head is stored as an offset relative to the token's own id
            "head": token_annotation.get_head(index) - token_id,
            "dep": token_annotation.get_dep(index),
        }
        if has_ner_tags:
            entry["ner"] = token_annotation.get_entity(index)
        entries.append(entry)
    return {"tokens": entries}
def create_json_doc(raw, sentences, id_):
    """Wrap a batch of sentences in the JSON document structure.

    raw (unicode): Raw text of the sentences; surrounding whitespace is
        stripped before it is stored.
    sentences (list): Sentence dicts to place in the single paragraph.
    id_: Identifier stored under the document's "id" key.
    RETURNS (dict): A document dict containing exactly one paragraph.
    """
    return {
        "id": id_,
        "paragraphs": [{"raw": raw.strip(), "sentences": sentences}],
    }
def example_from_conllu_sentence(vocab, lines, ner_tag_pattern,
merge_subtokens=False, append_morphology=False, ner_map=None):
"""Create an Example from the lines for one CoNLL-U sentence, merging
subtokens and appending morphology to tags if required.
lines (unicode): The non-comment lines for a CoNLL-U sentence
ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col
RETURNS (Example): An example containing the annotation
"""
# create a Doc with each subtoken as its own token
# if merging subtokens, each subtoken orth is the merged subtoken form
if not Token.has_extension("merged_orth"):
Token.set_extension("merged_orth", default="")
if not Token.has_extension("merged_lemma"):
Token.set_extension("merged_lemma", default="")
if not Token.has_extension("merged_morph"):
Token.set_extension("merged_morph", default="")
if not Token.has_extension("merged_spaceafter"):
Token.set_extension("merged_spaceafter", default="")
words, spaces, tags, poses, morphs, lemmas = [], [], [], [], [], []
heads, deps = [], []
subtok_word = ""
in_subtok = False
for i in range(len(lines)):
line = lines[i]
subtok_lines = []
parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
if "." in id_:
continue
if "-" in id_:
in_subtok = True
if "-" in id_:
in_subtok = True
subtok_word = word
subtok_start, subtok_end = id_.split("-")
subtok_spaceafter = "SpaceAfter=No" not in misc
continue
if merge_subtokens and in_subtok:
words.append(subtok_word)
else:
words.append(word) words.append(word)
tags.append(tag) if in_subtok:
heads.append(head) if id_ == subtok_end:
deps.append(dep) spaces.append(subtok_spaceafter)
ents.append(ent) else:
if "SpaceAfter=No" in misc: spaces.append(False)
elif "SpaceAfter=No" in misc:
spaces.append(False) spaces.append(False)
else: else:
spaces.append(True) spaces.append(True)
except: # noqa: E722 if in_subtok and id_ == subtok_end:
print(line) subtok_word = ""
raise in_subtok = False
id_ = int(id_) - 1
head = (int(head) - 1) if head != "0" else id_
tag = pos if tag == "_" else tag
morph = morph if morph != "_" else ""
dep = "ROOT" if dep == "root" else dep
lemmas.append(lemma)
poses.append(pos)
tags.append(tag)
morphs.append(morph)
heads.append(head)
deps.append(dep)
doc = Doc(vocab, words=words, spaces=spaces)
for i in range(len(doc)):
doc[i].tag_ = tags[i]
doc[i].pos_ = poses[i]
doc[i].dep_ = deps[i]
doc[i].lemma_ = lemmas[i]
doc[i].head = doc[heads[i]]
doc[i]._.merged_orth = words[i]
doc[i]._.merged_morph = morphs[i]
doc[i]._.merged_lemma = lemmas[i]
doc[i]._.merged_spaceafter = spaces[i]
ents = get_entities(lines, ner_tag_pattern, ner_map)
doc.ents = spans_from_biluo_tags(doc, ents)
doc.is_parsed = True
doc.is_tagged = True
if merge_subtokens:
doc = merge_conllu_subtokens(lines, doc)
# create Example from custom Doc annotation
ids, words, tags, heads, deps = [], [], [], [], []
pos, lemmas, morphs, spaces = [], [], [], []
for i, t in enumerate(doc):
ids.append(i)
words.append(t._.merged_orth)
if append_morphology and t._.merged_morph:
tags.append(t.tag_ + "__" + t._.merged_morph)
else:
tags.append(t.tag_)
pos.append(t.pos_)
morphs.append(t._.merged_morph)
lemmas.append(t._.merged_lemma)
heads.append(t.head.i)
deps.append(t.dep_)
spaces.append(t._.merged_spaceafter)
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
ents = biluo_tags_from_offsets(doc, ent_offsets)
raw = "" raw = ""
for word, space in zip(words, spaces): for word, space in zip(words, spaces):
raw += word raw += word
if space: if space:
raw += " " raw += " "
example = Example(doc=raw) example = Example(doc=raw)
example.set_token_annotation( example.set_token_annotation(ids=ids, words=words, tags=tags, pos=pos,
ids=ids, words=words, tags=tags, heads=heads, deps=deps, entities=ents morphs=morphs, lemmas=lemmas, heads=heads,
) deps=deps, entities=ents)
yield example return example
i += 1
if 1 <= n <= i:
break
def extract_tags(iob, tag_pattern, ner_map=None): def merge_conllu_subtokens(lines, doc):
""" # identify and process all subtoken spans to prepare attrs for merging
Extract tag from MISC column according to `tag_pattern` and map to final subtok_spans = []
entity type with `ner_map` if mapping present. for line in lines:
parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
if "-" in id_:
subtok_start, subtok_end = id_.split("-")
subtok_span = doc[int(subtok_start) - 1:int(subtok_end)]
subtok_spans.append(subtok_span)
# create merged tag, morph, and lemma values
tags = []
morphs = {}
lemmas = []
for token in subtok_span:
tags.append(token.tag_)
lemmas.append(token.lemma_)
if token._.merged_morph:
for feature in token._.merged_morph.split("|"):
field, values = feature.split("=", 1)
if not field in morphs:
morphs[field] = set()
for value in values.split(","):
morphs[field].add(value)
# create merged features for each morph field
for field, values in morphs.items():
morphs[field] = field + "=" + ",".join(sorted(values))
# set the same attrs on all subtok tokens so that whatever head the
# retokenizer chooses, the final attrs are available on that token
for token in subtok_span:
token._.merged_orth = token.orth_
token._.merged_lemma = " ".join(lemmas)
token.tag_ = "_".join(tags)
token._.merged_morph = "|".join(sorted(morphs.values()))
token._.merged_spaceafter = True if subtok_span[-1].whitespace_ else False
For NorNE: with doc.retokenize() as retokenizer:
Simplify tags obtained from the dataset in order to follow Wikipedia for span in subtok_spans:
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while retokenizer.merge(span)
'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
'MISC'.
"""
new_iob = []
for tag in iob:
tag_match = re.search(tag_pattern, tag)
new_tag = "O"
if tag_match:
prefix = tag_match.group(2)
suffix = tag_match.group(3)
if prefix and suffix:
new_tag = prefix + "-" + suffix
if ner_map:
suffix = ner_map.get(suffix, suffix)
if suffix == "":
new_tag = "O"
else:
new_tag = prefix + "-" + suffix
new_iob.append(new_tag)
return new_iob
def generate_sentence(token_annotation, has_ner_tags, tag_pattern, ner_map=None):
sentence = {}
tokens = []
if has_ner_tags:
iob = extract_tags(token_annotation.entities, tag_pattern, ner_map=ner_map)
biluo = iob_to_biluo(iob)
for i, id in enumerate(token_annotation.ids):
token = {}
token["id"] = id
token["orth"] = token_annotation.words[i]
token["tag"] = token_annotation.tags[i]
token["head"] = token_annotation.heads[i] - id
token["dep"] = token_annotation.deps[i]
if has_ner_tags:
token["ner"] = biluo[i]
tokens.append(token)
sentence["tokens"] = tokens
return sentence
def create_doc(raw, sentences, id):
doc = {}
paragraph = {}
doc["id"] = id
doc["paragraphs"] = []
paragraph["raw"] = raw.strip()
paragraph["sentences"] = sentences
doc["paragraphs"].append(paragraph)
return doc return doc

View File

@ -54,6 +54,43 @@ def test_cli_converters_conllu2json_name_ner_map():
assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"] assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
def test_cli_converters_conllu2json_subtokens():
    """Check that the multiword token range `2-3` is merged into one token.

    Runs the converter with `merge_subtokens=True` and
    `append_morphology=True` and verifies the merged orth, concatenated tag,
    combined morphology, joined lemma, relative heads and the NER tag.
    """
    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
    lines = [
        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
        "2-3\tFE\t_\t_\t_\t_\t_\t_\t_\t_",
        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tname=B-PER",
        "3\tEilertsen\tEilertsen\tX\t_\tGender=Fem|Tense=past\t2\tname\t_\tname=I-PER",
        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
    ]
    input_data = "\n".join(lines)
    converted = conllu2json(input_data, n_sents=1, merge_subtokens=True,
                            append_morphology=True)
    assert len(converted) == 1
    assert converted[0]["id"] == 0
    assert len(converted[0]["paragraphs"]) == 1
    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår."
    assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
    sent = converted[0]["paragraphs"][0]["sentences"][0]
    assert len(sent["tokens"]) == 4
    tokens = sent["tokens"]
    # (leftover debugging print of `tokens` removed; failing asserts already
    # show the compared values)
    assert [t["orth"] for t in tokens] == ["Dommer", "FE", "avstår", "."]
    assert [t["tag"] for t in tokens] == [
        "NOUN__Definite=Ind|Gender=Masc|Number=Sing",
        "PROPN_X__Gender=Fem,Masc|Tense=past",
        "VERB__Mood=Ind|Tense=Pres|VerbForm=Fin",
        "PUNCT"
    ]
    assert [t["pos"] for t in tokens] == ['NOUN', 'PROPN', 'VERB', 'PUNCT']
    assert [t["morph"] for t in tokens] == ['Definite=Ind|Gender=Masc|Number=Sing', 'Gender=Fem,Masc|Tense=past', 'Mood=Ind|Tense=Pres|VerbForm=Fin', '']
    assert [t["lemma"] for t in tokens] == ['dommer', 'Finn Eilertsen', 'avstå', '$.']
    assert [t["head"] for t in tokens] == [1, 1, 0, -1]
    assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
    assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
def test_cli_converters_iob2json(): def test_cli_converters_iob2json():
lines = [ lines = [
"I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",