mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Update converter
This commit is contained in:
parent
4262f231c5
commit
90ecb5d857
|
@ -12,6 +12,7 @@ def conllu2docs(
|
||||||
n_sents=10,
|
n_sents=10,
|
||||||
append_morphology=False,
|
append_morphology=False,
|
||||||
ner_map=None,
|
ner_map=None,
|
||||||
|
tag_map=None,
|
||||||
merge_subtokens=False,
|
merge_subtokens=False,
|
||||||
no_print=False,
|
no_print=False,
|
||||||
**_
|
**_
|
||||||
|
@ -32,8 +33,10 @@ def conllu2docs(
|
||||||
append_morphology=append_morphology,
|
append_morphology=append_morphology,
|
||||||
ner_tag_pattern=MISC_NER_PATTERN,
|
ner_tag_pattern=MISC_NER_PATTERN,
|
||||||
ner_map=ner_map,
|
ner_map=ner_map,
|
||||||
|
tag_map=tag_map,
|
||||||
merge_subtokens=merge_subtokens,
|
merge_subtokens=merge_subtokens,
|
||||||
)
|
)
|
||||||
|
sent_docs = list(sent_docs)
|
||||||
docs = []
|
docs = []
|
||||||
sent_docs_to_merge = []
|
sent_docs_to_merge = []
|
||||||
for sent_doc in sent_docs:
|
for sent_doc in sent_docs:
|
||||||
|
@ -70,10 +73,14 @@ def read_conllx(
|
||||||
merge_subtokens=False,
|
merge_subtokens=False,
|
||||||
ner_tag_pattern="",
|
ner_tag_pattern="",
|
||||||
ner_map=None,
|
ner_map=None,
|
||||||
|
tag_map=None
|
||||||
):
|
):
|
||||||
""" Yield docs, one for each sentence """
|
""" Yield docs, one for each sentence """
|
||||||
vocab = Vocab() # need vocab to make a minimal Doc
|
vocab = Vocab() # need vocab to make a minimal Doc
|
||||||
for sent in input_data.strip().split("\n\n"):
|
# Need to support older conll formats, where we might have spaces between
|
||||||
|
# the lines =/. Stanford convert 3.3 seems to do this?
|
||||||
|
segment_re = re.compile(r"\n *\n")
|
||||||
|
for sent in segment_re.split(input_data.strip()):
|
||||||
lines = sent.strip().split("\n")
|
lines = sent.strip().split("\n")
|
||||||
if lines:
|
if lines:
|
||||||
while lines[0].startswith("#"):
|
while lines[0].startswith("#"):
|
||||||
|
@ -85,6 +92,7 @@ def read_conllx(
|
||||||
merge_subtokens=merge_subtokens,
|
merge_subtokens=merge_subtokens,
|
||||||
append_morphology=append_morphology,
|
append_morphology=append_morphology,
|
||||||
ner_map=ner_map,
|
ner_map=ner_map,
|
||||||
|
tag_map=tag_map
|
||||||
)
|
)
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
|
@ -135,6 +143,7 @@ def doc_from_conllu_sentence(
|
||||||
merge_subtokens=False,
|
merge_subtokens=False,
|
||||||
append_morphology=False,
|
append_morphology=False,
|
||||||
ner_map=None,
|
ner_map=None,
|
||||||
|
tag_map=None
|
||||||
):
|
):
|
||||||
"""Create an Example from the lines for one CoNLL-U sentence, merging
|
"""Create an Example from the lines for one CoNLL-U sentence, merging
|
||||||
subtokens and appending morphology to tags if required.
|
subtokens and appending morphology to tags if required.
|
||||||
|
@ -161,6 +170,8 @@ def doc_from_conllu_sentence(
|
||||||
line = lines[i]
|
line = lines[i]
|
||||||
parts = line.split("\t")
|
parts = line.split("\t")
|
||||||
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
|
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
|
||||||
|
if tag_map is not None:
|
||||||
|
pos = tag_map[tag]["POS"]
|
||||||
if "." in id_:
|
if "." in id_:
|
||||||
continue
|
continue
|
||||||
if "-" in id_:
|
if "-" in id_:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user