From 4262f231c5e6d916c82c989fbe960e4c897b8cbf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Sep 2020 18:20:18 +0200 Subject: [PATCH] Fix conversion of older CoNLL parsing files There are a billion "CoNLL" formats, depending on the tool producing them. The Stanford v3.3 converter has a few quirks that the CoNLL-X conversion wasn't handling: * Sentences may have extra spacing in between the newlines * The coarse-grained POS is the same as the fine-grained POS, so we need a tag map to get the coarse-grained POS. Needing the tag map is particularly unfortunate; it feels like something that should be patched on the source data? Adding the extra option may be confusing to people, especially since it *overwrites* the corpus tag. --- spacy/cli/convert.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index ad89b9976..e6c1305dd 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -51,6 +51,7 @@ def convert_cli( ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"), + tag_map: Optional[Path] = Opt(None, "--tag-map", "-tm", help="Tag mapping (as JSON-encoded dict of tag types)", exists=True), # fmt: on ): """ @@ -84,6 +85,7 @@ def convert_cli( merge_subtokens=merge_subtokens, converter=converter, ner_map=ner_map, + tag_map=tag_map, lang=lang, concatenate=concatenate, silent=silent, @@ -103,6 +105,7 @@ def convert( merge_subtokens: bool = False, converter: str = "auto", ner_map: Optional[Path] = None, + tag_map: Optional[Path] = None, lang: Optional[str] = None, concatenate: bool = False, silent: bool = True, @@ -111,6 +114,7 @@ def convert( if not msg: msg = Printer(no_print=silent) ner_map = srsly.read_json(ner_map) if ner_map is not 
None else None + tag_map = srsly.read_json(tag_map) if tag_map is not None else None doc_files = [] for input_loc in walk_directory(Path(input_path), converter): input_data = input_loc.open("r", encoding="utf-8").read() @@ -126,6 +130,7 @@ def convert( model=model, no_print=silent, ner_map=ner_map, + tag_map=tag_map, ) doc_files.append((input_loc, docs)) if concatenate: