spaCy/spacy/cli/converters/conllu2json.py
Ines Montani 37c7c85a86 💫 New JSON helpers, training data internals & CLI rewrite (#2932)
* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space.
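
A minimal sketch of how the method described above might be used (the `underscore` keyword and the attribute name are illustrative, not confirmed by this commit message):

```python
import spacy
from spacy.tokens import Doc

# register a custom attribute so it lives in the doc._. space
Doc.set_extension("source", default=None)

nlp = spacy.blank("en")
doc = nlp("Apple is looking at buying a U.K. startup.")
doc._.source = "example.txt"

# serialize to the unified JSON format, including the custom attribute
data = doc.to_json(underscore=["source"])
```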

* Remove outdated test

* Add write_json and write_jsonl helpers

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI to use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters functions that take the opened file and return the converted data (instead of having them handle the IO); see the sketch below
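
A rough sketch of the converter pattern described in the last point (the converter name `my2json` is hypothetical; `conllu2json` in this file follows the same shape):

```python
def my2json(input_data, n_sents=10, lang=None, **kwargs):
    # The CLI opens and reads the file; the converter only transforms the
    # string it receives and returns the converted data.
    docs = []
    for block in input_data.strip().split("\n\n"):
        docs.append({"id": len(docs), "paragraphs": []})  # fill in per the training format
    return docs
```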

### Types of change
enhancement

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command
2018-11-30 20:16:14 +01:00

138 lines
4.2 KiB
Python

# coding: utf8
from __future__ import unicode_literals

import re

from ...gold import iob_to_biluo


def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None):
    """
    Convert conllu files into JSON format for use with the train cli.
    The use_morphology parameter enables appending morphology to tags, which is
    useful for languages such as Spanish, where UD tags are not so rich.

    Extract NER tags if available and convert them so that they follow
    BILUO and the Wikipedia scheme.
    """
    # by @dvsrepo, via #11 explosion/spacy-dev-resources
    # by @katarkor
    docs = []
    sentences = []
    conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
    checked_for_ner = False
    has_ner_tags = False
    for i, (raw_text, tokens) in enumerate(conll_tuples):
        sentence, brackets = tokens[0]
        if not checked_for_ner:
            has_ner_tags = is_ner(sentence[5][0])
            checked_for_ner = True
        sentences.append(generate_sentence(sentence, has_ner_tags))
        # Real-sized documents could be extracted using the comments on the
        # conllu document
        if len(sentences) % n_sents == 0:
            doc = create_doc(sentences, i)
            docs.append(doc)
            sentences = []
    return docs
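

# Illustrative usage only (not part of the original module): the CLI reads the
# file and passes its contents in as a string, e.g.
#
#     with open("train.conllu", "r", encoding="utf8") as file_:  # hypothetical path
#         docs = conllu2json(file_.read(), n_sents=10)
#
# docs is then a list of doc dicts ready to be written out with a JSON helper.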


def is_ner(tag):
    """
    Check the 10th column of the first token to determine if the file contains
    NER tags.
    """
    tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
    if tag_match:
        return True
    elif tag == "O":
        return True
    else:
        return False


def read_conllx(input_data, use_morphology=False, n=0):
    i = 0
    for sent in input_data.strip().split("\n\n"):
        lines = sent.strip().split("\n")
        if lines:
            while lines[0].startswith("#"):
                lines.pop(0)
            tokens = []
            for line in lines:
                parts = line.split("\t")
                id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
                # Skip multi-word tokens ("1-2") and empty nodes ("1.1") in the
                # CoNLL-U ID column
                if "-" in id_ or "." in id_:
                    continue
                try:
                    id_ = int(id_) - 1
                    head = (int(head) - 1) if head != "0" else id_
                    dep = "ROOT" if dep == "root" else dep
                    tag = pos if tag == "_" else tag
                    tag = tag + "__" + morph if use_morphology else tag
                    tokens.append((id_, word, tag, head, dep, iob))
                except:  # noqa: E722
                    print(line)
                    raise
            tuples = [list(t) for t in zip(*tokens)]
            yield (None, [[tuples, []]])
            i += 1
            if n >= 1 and i >= n:
                break


def simplify_tags(iob):
    """
    Simplify tags obtained from the dataset in order to follow Wikipedia
    scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
    'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags
    to 'MISC'.
    """
    new_iob = []
    for tag in iob:
        tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
        if tag_match:
            prefix = tag_match.group(1)
            suffix = tag_match.group(2)
            if suffix == "GPE_LOC":
                suffix = "LOC"
            elif suffix == "GPE_ORG":
                suffix = "ORG"
            elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
                suffix = "MISC"
            tag = prefix + "-" + suffix
        new_iob.append(tag)
    return new_iob
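

# For example (illustrative values, not taken from the converter's test data):
#     simplify_tags(["B-GPE_LOC", "I-GPE_LOC", "B-PROD", "O"])
#     ==> ["B-LOC", "I-LOC", "B-MISC", "O"]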


def generate_sentence(sent, has_ner_tags):
    (id_, word, tag, head, dep, iob) = sent
    sentence = {}
    tokens = []
    if has_ner_tags:
        iob = simplify_tags(iob)
        biluo = iob_to_biluo(iob)
    for i, id in enumerate(id_):
        token = {}
        token["id"] = id
        token["orth"] = word[i]
        token["tag"] = tag[i]
        token["head"] = head[i] - id
        token["dep"] = dep[i]
        if has_ner_tags:
            token["ner"] = biluo[i]
        tokens.append(token)
    sentence["tokens"] = tokens
    return sentence


def create_doc(sentences, id):
    doc = {}
    paragraph = {}
    doc["id"] = id
    doc["paragraphs"] = []
    paragraph["sentences"] = sentences
    doc["paragraphs"].append(paragraph)
    return doc
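

# The converter's output follows the training JSON format consumed by the train
# cli; roughly (field values illustrative, heads are offsets relative to the token):
#
#     [{"id": 0,
#       "paragraphs": [{"sentences": [{"tokens": [
#           {"id": 0, "orth": "Hello", "tag": "UH", "head": 0, "dep": "ROOT", "ner": "O"},
#           ...]}]}]}]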