Add convert CLI option to merge CoNLL-U subtokens (#4722)

* Add convert CLI option to merge CoNLL-U subtokens

Add `-T` option to convert CLI that merges CoNLL-U subtokens into one
token in the converted data. Each CoNLL-U sentence is read into a `Doc`
and the `Retokenizer` is used to merge subtokens with features as
follows:

* `orth` is the merged token orth (should correspond to the raw text and
`# text`)

* `tag` is all subtoken tags concatenated with `_`, e.g. `ADP_DET`

* `pos` is the POS of the syntactic root of the span (as determined by
the Retokenizer)

* `morph` is all morphological features merged

* `lemma` is all subtoken lemmas concatenated with ` `, e.g. `de o`

* with `-m`, all morphological features are combined with the tag using
the separator `__`, e.g.
`ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art`

* `dep` is the dependency relation for the syntactic root of the span
(as determined by the Retokenizer)

Concatenated tags will be mapped to the UD POS of the syntactic root
(e.g., `ADP`) and the morphological features will be the combined
features.
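
The merged values for a subtoken span are composed roughly as follows (a
minimal standalone sketch mirroring `merge_conllu_subtokens` below; the
example inputs are illustrative):

```python
# Illustrative inputs for the Portuguese contraction "do" = "de" + "o"
subtok_tags = ["ADP", "DET"]
subtok_lemmas = ["de", "o"]
subtok_morphs = ["", "Definite=Def|Gender=Masc|Number=Sing|PronType=Art"]

tag = "_".join(subtok_tags)      # "ADP_DET"
lemma = " ".join(subtok_lemmas)  # "de o"

# union the morphological features, merging the values per field
fields = {}
for morph in subtok_morphs:
    for feature in (morph.split("|") if morph else []):
        field, values = feature.split("=", 1)
        fields.setdefault(field, set()).update(values.split(","))
morph = "|".join(sorted(f + "=" + ",".join(sorted(v)) for f, v in fields.items()))
# "Definite=Def|Gender=Masc|Number=Sing|PronType=Art"

# with -m, the merged morph is appended to the merged tag with "__"
tag_with_morph = tag + "__" + morph if morph else tag
# "ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art"
```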

In many cases, the original UD subtokens can be reconstructed from the
available features, given either a language-specific lookup table (e.g.,
Portuguese `do / ADP_DET / Definite=Def|Gender=Masc|Number=Sing|PronType=Art`
is `de / ADP` + `o / DET / Definite=Def|Gender=Masc|Number=Sing|PronType=Art`)
or lookup rules for forms containing open-class words (e.g., Spanish
`hablarlo / VERB_PRON / Case=Acc|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs|VerbForm=Inf`).
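
Such a lookup table is not part of this PR; a purely hypothetical sketch of
the reconstruction step, where `SUBTOK_LOOKUP` and `split_merged_token` are
illustrative names and not spaCy API:

```python
# Hypothetical language-specific lookup table mapping (form, merged tag)
# back to the original UD subtokens (illustrative only, not part of this PR).
SUBTOK_LOOKUP = {
    ("do", "ADP_DET"): [
        ("de", "ADP", ""),
        ("o", "DET", "Definite=Def|Gender=Masc|Number=Sing|PronType=Art"),
    ],
}

def split_merged_token(form, tag):
    """Return (form, pos, morph) triples for the original subtokens, if known."""
    return SUBTOK_LOOKUP.get((form, tag.split("__")[0]))

print(split_merged_token("do", "ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art"))
```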

* Clean up imports
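
A minimal usage sketch (the file names are illustrative; the equivalent CLI
call would be along the lines of
`python -m spacy convert file.conllu ./out -c conllu -T -m`):

```python
# Convert CoNLL-U data with subtoken merging and appended morphology,
# mirroring the new converter options (file name is illustrative).
from spacy.cli.converters import conllu2json

with open("file.conllu", encoding="utf8") as f:
    input_data = f.read()
docs = conllu2json(input_data, n_sents=10, merge_subtokens=True,
                   append_morphology=True)
```
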
adrianeboyd committed on 2020-01-29 17:44:25 +01:00
commit a365359b36 (parent 569cc98982)
3 changed files with 273 additions and 98 deletions


@@ -34,6 +34,7 @@ def convert(
     seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
     model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
     morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
+    merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
     converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
     ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
     lang: ("Language (if tokenizer required)", "option", "l", str) = None,
@@ -85,7 +86,8 @@ def convert(
         input_data,
         n_sents=n_sents,
         seg_sents=seg_sents,
-        use_morphology=morphology,
+        append_morphology=morphology,
+        merge_subtokens=merge_subtokens,
         lang=lang,
         model=model,
         no_print=no_print,


@@ -1,36 +1,36 @@
 import re
-from spacy.gold import Example
-from ...gold import iob_to_biluo
+from ...gold import Example
+from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
+from ...language import Language
+from ...tokens import Doc, Token
+from .conll_ner2json import n_sents_info
+from wasabi import Printer
 
 
 def conllu2json(
-    input_data, n_sents=10, use_morphology=False, lang=None, ner_map=None, **_
+    input_data, n_sents=10, append_morphology=False, lang=None, ner_map=None,
+    merge_subtokens=False, no_print=False, **_
 ):
     """
     Convert conllu files into JSON format for use with train cli.
-    use_morphology parameter enables appending morphology to tags, which is
+    append_morphology parameter enables appending morphology to tags, which is
     useful for languages such as Spanish, where UD tags are not so rich.
 
     Extract NER tags if available and convert them so that they follow
     BILUO and the Wikipedia scheme
     """
     # by @dvsrepo, via #11 explosion/spacy-dev-resources
     # by @katarkor
     # name=NER is to handle NorNE
     MISC_NER_PATTERN = r"\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"
+    msg = Printer(no_print=no_print)
+    n_sents_info(msg, n_sents)
     docs = []
     raw = ""
     sentences = []
-    conll_data = read_conllx(input_data, use_morphology=use_morphology)
-    checked_for_ner = False
-    has_ner_tags = False
+    conll_data = read_conllx(input_data, append_morphology=append_morphology,
+                             ner_tag_pattern=MISC_NER_PATTERN, ner_map=ner_map,
+                             merge_subtokens=merge_subtokens)
+    has_ner_tags = has_ner(input_data, ner_tag_pattern=MISC_NER_PATTERN)
     for i, example in enumerate(conll_data):
-        if not checked_for_ner:
-            has_ner_tags = is_ner(
-                example.token_annotation.entities[0], MISC_NER_PATTERN
-            )
-            checked_for_ner = True
         raw += example.text
         sentences.append(
             generate_sentence(
@@ -43,137 +43,273 @@ def conllu2json(
         # Real-sized documents could be extracted using the comments on the
         # conllu document
         if len(sentences) % n_sents == 0:
-            doc = create_doc(raw, sentences, i)
+            doc = create_json_doc(raw, sentences, i)
             docs.append(doc)
             raw = ""
             sentences = []
     if sentences:
-        doc = create_doc(raw, sentences, i)
+        doc = create_json_doc(raw, sentences, i)
         docs.append(doc)
     return docs
 
 
-def is_ner(tag, tag_pattern):
+def has_ner(input_data, ner_tag_pattern):
     """
     Check the 10th column of the first token to determine if the file contains
     NER tags
     """
-    tag_match = re.search(tag_pattern, tag)
-    if tag_match:
-        return True
-    elif tag == "O":
-        return True
-    else:
-        return False
+    for sent in input_data.strip().split("\n\n"):
+        lines = sent.strip().split("\n")
+        if lines:
+            while lines[0].startswith("#"):
+                lines.pop(0)
+            if lines:
+                parts = lines[0].split("\t")
+                id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+                if re.search(ner_tag_pattern, misc):
+                    return True
+                else:
+                    return False
 
 
-def read_conllx(input_data, use_morphology=False, n=0):
-    """ Yield example data points, one for each sentence """
-    i = 0
+def read_conllx(input_data, append_morphology=False, merge_subtokens=False,
+                ner_tag_pattern="", ner_map=None):
+    """ Yield examples, one for each sentence """
+    vocab = Language.Defaults.create_vocab()  # need vocab to make a minimal Doc
     for sent in input_data.strip().split("\n\n"):
         lines = sent.strip().split("\n")
         if lines:
             while lines[0].startswith("#"):
                 lines.pop(0)
-            ids, words, tags, heads, deps, ents = [], [], [], [], [], []
-            spaces = []
-            for line in lines:
-                parts = line.split("\t")
-                id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
-                if "-" in id_ or "." in id_:
-                    continue
-                try:
-                    id_ = int(id_) - 1
-                    head = (int(head) - 1) if head != "0" else id_
-                    dep = "ROOT" if dep == "root" else dep
-                    tag = pos if tag == "_" else tag
-                    tag = tag + "__" + morph if use_morphology else tag
-                    ent = misc if misc else "O"
-                    ids.append(id_)
-                    words.append(word)
-                    tags.append(tag)
-                    heads.append(head)
-                    deps.append(dep)
-                    ents.append(ent)
-                    if "SpaceAfter=No" in misc:
-                        spaces.append(False)
-                    else:
-                        spaces.append(True)
-                except:  # noqa: E722
-                    print(line)
-                    raise
-            raw = ""
-            for word, space in zip(words, spaces):
-                raw += word
-                if space:
-                    raw += " "
-            example = Example(doc=raw)
-            example.set_token_annotation(
-                ids=ids, words=words, tags=tags, heads=heads, deps=deps, entities=ents
-            )
+            example = example_from_conllu_sentence(vocab, lines,
+                ner_tag_pattern, merge_subtokens=merge_subtokens,
+                append_morphology=append_morphology,
+                ner_map=ner_map)
             yield example
-            i += 1
-            if 1 <= n <= i:
-                break
 
 
-def extract_tags(iob, tag_pattern, ner_map=None):
-    """
-    Extract tag from MISC column according to `tag_pattern` and map to final
-    entity type with `ner_map` if mapping present.
-
-    For NorNE:
-    Simplify tags obtained from the dataset in order to follow Wikipedia
-    scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
-    'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
-    'MISC'.
-    """
-    new_iob = []
-    for tag in iob:
-        tag_match = re.search(tag_pattern, tag)
-        new_tag = "O"
+def get_entities(lines, tag_pattern, ner_map=None):
+    """Find entities in the MISC column according to the pattern and map to
+    final entity type with `ner_map` if mapping present. Entity tag is 'O' if
+    the pattern is not matched.
+
+    lines (unicode): CoNLL-U lines for one sentence
+    tag_pattern (unicode): Regex pattern for entity tag
+    ner_map (dict): Map old NER tag names to new ones, '' maps to O.
+    RETURNS (list): List of BILUO entity tags
+    """
+    miscs = []
+    for line in lines:
+        parts = line.split("\t")
+        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+        if "-" in id_ or "." in id_:
+            continue
+        miscs.append(misc)
+    iob = []
+    for misc in miscs:
+        tag_match = re.search(tag_pattern, misc)
+        iob_tag = "O"
         if tag_match:
             prefix = tag_match.group(2)
             suffix = tag_match.group(3)
             if prefix and suffix:
-                new_tag = prefix + "-" + suffix
+                iob_tag = prefix + "-" + suffix
                 if ner_map:
                     suffix = ner_map.get(suffix, suffix)
                     if suffix == "":
-                        new_tag = "O"
+                        iob_tag = "O"
                     else:
-                        new_tag = prefix + "-" + suffix
-        new_iob.append(new_tag)
-    return new_iob
+                        iob_tag = prefix + "-" + suffix
+        iob.append(iob_tag)
+    return iob_to_biluo(iob)
 
 
 def generate_sentence(token_annotation, has_ner_tags, tag_pattern, ner_map=None):
     sentence = {}
     tokens = []
-    if has_ner_tags:
-        iob = extract_tags(token_annotation.entities, tag_pattern, ner_map=ner_map)
-        biluo = iob_to_biluo(iob)
-    for i, id in enumerate(token_annotation.ids):
+    for i, id_ in enumerate(token_annotation.ids):
         token = {}
-        token["id"] = id
-        token["orth"] = token_annotation.words[i]
-        token["tag"] = token_annotation.tags[i]
-        token["head"] = token_annotation.heads[i] - id
-        token["dep"] = token_annotation.deps[i]
+        token["id"] = id_
+        token["orth"] = token_annotation.get_word(i)
+        token["tag"] = token_annotation.get_tag(i)
+        token["pos"] = token_annotation.get_pos(i)
+        token["lemma"] = token_annotation.get_lemma(i)
+        token["morph"] = token_annotation.get_morph(i)
+        token["head"] = token_annotation.get_head(i) - id_
+        token["dep"] = token_annotation.get_dep(i)
         if has_ner_tags:
-            token["ner"] = biluo[i]
+            token["ner"] = token_annotation.get_entity(i)
         tokens.append(token)
     sentence["tokens"] = tokens
     return sentence
 
 
-def create_doc(raw, sentences, id):
+def create_json_doc(raw, sentences, id_):
     doc = {}
     paragraph = {}
-    doc["id"] = id
+    doc["id"] = id_
     doc["paragraphs"] = []
     paragraph["raw"] = raw.strip()
     paragraph["sentences"] = sentences
     doc["paragraphs"].append(paragraph)
     return doc
+
+
+def example_from_conllu_sentence(vocab, lines, ner_tag_pattern,
+                                 merge_subtokens=False, append_morphology=False,
+                                 ner_map=None):
+    """Create an Example from the lines for one CoNLL-U sentence, merging
+    subtokens and appending morphology to tags if required.
+
+    lines (unicode): The non-comment lines for a CoNLL-U sentence
+    ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col
+    RETURNS (Example): An example containing the annotation
+    """
+    # create a Doc with each subtoken as its own token
+    # if merging subtokens, each subtoken orth is the merged subtoken form
+    if not Token.has_extension("merged_orth"):
+        Token.set_extension("merged_orth", default="")
+    if not Token.has_extension("merged_lemma"):
+        Token.set_extension("merged_lemma", default="")
+    if not Token.has_extension("merged_morph"):
+        Token.set_extension("merged_morph", default="")
+    if not Token.has_extension("merged_spaceafter"):
+        Token.set_extension("merged_spaceafter", default="")
+    words, spaces, tags, poses, morphs, lemmas = [], [], [], [], [], []
+    heads, deps = [], []
+    subtok_word = ""
+    in_subtok = False
+    for line in lines:
+        parts = line.split("\t")
+        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+        if "." in id_:
+            continue
+        if "-" in id_:
+            in_subtok = True
+            subtok_word = word
+            subtok_start, subtok_end = id_.split("-")
+            subtok_spaceafter = "SpaceAfter=No" not in misc
+            continue
+        if merge_subtokens and in_subtok:
+            words.append(subtok_word)
+        else:
+            words.append(word)
+        if in_subtok:
+            if id_ == subtok_end:
+                spaces.append(subtok_spaceafter)
+            else:
+                spaces.append(False)
+        elif "SpaceAfter=No" in misc:
+            spaces.append(False)
+        else:
+            spaces.append(True)
+        if in_subtok and id_ == subtok_end:
+            subtok_word = ""
+            in_subtok = False
+        id_ = int(id_) - 1
+        head = (int(head) - 1) if head != "0" else id_
+        tag = pos if tag == "_" else tag
+        morph = morph if morph != "_" else ""
+        dep = "ROOT" if dep == "root" else dep
+        lemmas.append(lemma)
+        poses.append(pos)
+        tags.append(tag)
+        morphs.append(morph)
+        heads.append(head)
+        deps.append(dep)
+    doc = Doc(vocab, words=words, spaces=spaces)
+    for i in range(len(doc)):
+        doc[i].tag_ = tags[i]
+        doc[i].pos_ = poses[i]
+        doc[i].dep_ = deps[i]
+        doc[i].lemma_ = lemmas[i]
+        doc[i].head = doc[heads[i]]
+        doc[i]._.merged_orth = words[i]
+        doc[i]._.merged_morph = morphs[i]
+        doc[i]._.merged_lemma = lemmas[i]
+        doc[i]._.merged_spaceafter = spaces[i]
+    ents = get_entities(lines, ner_tag_pattern, ner_map)
+    doc.ents = spans_from_biluo_tags(doc, ents)
+    doc.is_parsed = True
+    doc.is_tagged = True
+    if merge_subtokens:
+        doc = merge_conllu_subtokens(lines, doc)
+    # create Example from custom Doc annotation
+    ids, words, tags, heads, deps = [], [], [], [], []
+    pos, lemmas, morphs, spaces = [], [], [], []
+    for i, t in enumerate(doc):
+        ids.append(i)
+        words.append(t._.merged_orth)
+        if append_morphology and t._.merged_morph:
+            tags.append(t.tag_ + "__" + t._.merged_morph)
+        else:
+            tags.append(t.tag_)
+        pos.append(t.pos_)
+        morphs.append(t._.merged_morph)
+        lemmas.append(t._.merged_lemma)
+        heads.append(t.head.i)
+        deps.append(t.dep_)
+        spaces.append(t._.merged_spaceafter)
+    ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
+    ents = biluo_tags_from_offsets(doc, ent_offsets)
+    raw = ""
+    for word, space in zip(words, spaces):
+        raw += word
+        if space:
+            raw += " "
+    example = Example(doc=raw)
+    example.set_token_annotation(ids=ids, words=words, tags=tags, pos=pos,
+                                 morphs=morphs, lemmas=lemmas, heads=heads,
+                                 deps=deps, entities=ents)
+    return example
+
+
+def merge_conllu_subtokens(lines, doc):
+    # identify and process all subtoken spans to prepare attrs for merging
+    subtok_spans = []
+    for line in lines:
+        parts = line.split("\t")
+        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+        if "-" in id_:
+            subtok_start, subtok_end = id_.split("-")
+            subtok_span = doc[int(subtok_start) - 1:int(subtok_end)]
+            subtok_spans.append(subtok_span)
+            # create merged tag, morph, and lemma values
+            tags = []
+            morphs = {}
+            lemmas = []
+            for token in subtok_span:
+                tags.append(token.tag_)
+                lemmas.append(token.lemma_)
+                if token._.merged_morph:
+                    for feature in token._.merged_morph.split("|"):
+                        field, values = feature.split("=", 1)
+                        if field not in morphs:
+                            morphs[field] = set()
+                        for value in values.split(","):
+                            morphs[field].add(value)
+            # create merged features for each morph field
+            for field, values in morphs.items():
+                morphs[field] = field + "=" + ",".join(sorted(values))
+            # set the same attrs on all subtok tokens so that whatever head the
+            # retokenizer chooses, the final attrs are available on that token
+            for token in subtok_span:
+                token._.merged_orth = token.orth_
+                token._.merged_lemma = " ".join(lemmas)
+                token.tag_ = "_".join(tags)
+                token._.merged_morph = "|".join(sorted(morphs.values()))
+                token._.merged_spaceafter = True if subtok_span[-1].whitespace_ else False
+    with doc.retokenize() as retokenizer:
+        for span in subtok_spans:
+            retokenizer.merge(span)
+    return doc


@@ -54,6 +54,43 @@ def test_cli_converters_conllu2json_name_ner_map():
     assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
 
 
+def test_cli_converters_conllu2json_subtokens():
+    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
+    lines = [
+        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
+        "2-3\tFE\t_\t_\t_\t_\t_\t_\t_\t_",
+        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tname=B-PER",
+        "3\tEilertsen\tEilertsen\tX\t_\tGender=Fem|Tense=past\t2\tname\t_\tname=I-PER",
+        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
+        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
+    ]
+    input_data = "\n".join(lines)
+    converted = conllu2json(input_data, n_sents=1, merge_subtokens=True,
+                            append_morphology=True)
+    assert len(converted) == 1
+    assert converted[0]["id"] == 0
+    assert len(converted[0]["paragraphs"]) == 1
+    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår."
+    assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
+    sent = converted[0]["paragraphs"][0]["sentences"][0]
+    assert len(sent["tokens"]) == 4
+    tokens = sent["tokens"]
+    assert [t["orth"] for t in tokens] == ["Dommer", "FE", "avstår", "."]
+    assert [t["tag"] for t in tokens] == [
+        "NOUN__Definite=Ind|Gender=Masc|Number=Sing",
+        "PROPN_X__Gender=Fem,Masc|Tense=past",
+        "VERB__Mood=Ind|Tense=Pres|VerbForm=Fin",
+        "PUNCT",
+    ]
+    assert [t["pos"] for t in tokens] == ["NOUN", "PROPN", "VERB", "PUNCT"]
+    assert [t["morph"] for t in tokens] == [
+        "Definite=Ind|Gender=Masc|Number=Sing",
+        "Gender=Fem,Masc|Tense=past",
+        "Mood=Ind|Tense=Pres|VerbForm=Fin",
+        "",
+    ]
+    assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
+    assert [t["head"] for t in tokens] == [1, 1, 0, -1]
+    assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
+    assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
+
+
 def test_cli_converters_iob2json():
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",