Update converter

This commit is contained in:
Matthew Honnibal 2020-09-12 19:45:10 +02:00
parent 4262f231c5
commit 90ecb5d857

View File

@ -12,6 +12,7 @@ def conllu2docs(
n_sents=10, n_sents=10,
append_morphology=False, append_morphology=False,
ner_map=None, ner_map=None,
tag_map=None,
merge_subtokens=False, merge_subtokens=False,
no_print=False, no_print=False,
**_ **_
@ -32,8 +33,10 @@ def conllu2docs(
append_morphology=append_morphology, append_morphology=append_morphology,
ner_tag_pattern=MISC_NER_PATTERN, ner_tag_pattern=MISC_NER_PATTERN,
ner_map=ner_map, ner_map=ner_map,
tag_map=tag_map,
merge_subtokens=merge_subtokens, merge_subtokens=merge_subtokens,
) )
sent_docs = list(sent_docs)
docs = [] docs = []
sent_docs_to_merge = [] sent_docs_to_merge = []
for sent_doc in sent_docs: for sent_doc in sent_docs:
@ -70,10 +73,14 @@ def read_conllx(
merge_subtokens=False, merge_subtokens=False,
ner_tag_pattern="", ner_tag_pattern="",
ner_map=None, ner_map=None,
tag_map=None
): ):
""" Yield docs, one for each sentence """ """ Yield docs, one for each sentence """
vocab = Vocab() # need vocab to make a minimal Doc vocab = Vocab() # need vocab to make a minimal Doc
for sent in input_data.strip().split("\n\n"): # Need to support older conll formats, where we might have spaces between
# the lines =/. Stanford converter 3.3 seems to do this?
segment_re = re.compile(r"\n *\n")
for sent in segment_re.split(input_data.strip()):
lines = sent.strip().split("\n") lines = sent.strip().split("\n")
if lines: if lines:
while lines[0].startswith("#"): while lines[0].startswith("#"):
@ -85,6 +92,7 @@ def read_conllx(
merge_subtokens=merge_subtokens, merge_subtokens=merge_subtokens,
append_morphology=append_morphology, append_morphology=append_morphology,
ner_map=ner_map, ner_map=ner_map,
tag_map=tag_map
) )
yield doc yield doc
@ -135,6 +143,7 @@ def doc_from_conllu_sentence(
merge_subtokens=False, merge_subtokens=False,
append_morphology=False, append_morphology=False,
ner_map=None, ner_map=None,
tag_map=None
): ):
"""Create an Example from the lines for one CoNLL-U sentence, merging """Create an Example from the lines for one CoNLL-U sentence, merging
subtokens and appending morphology to tags if required. subtokens and appending morphology to tags if required.
@ -161,6 +170,8 @@ def doc_from_conllu_sentence(
line = lines[i] line = lines[i]
parts = line.split("\t") parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
if tag_map is not None:
pos = tag_map[tag]["POS"]
if "." in id_: if "." in id_:
continue continue
if "-" in id_: if "-" in id_: