mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)

Fix for Issue 4665 - conllu2json - Allowing HEAD to be an underscore * Added contributor agreement

142 lines · 4.3 KiB · Python
# coding: utf8
from __future__ import unicode_literals

import re

from ...gold import iob_to_biluo


def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
    """
    Convert conllu files into JSON format for use with the train cli.
    use_morphology parameter enables appending morphology to tags, which is
    useful for languages such as Spanish, where UD tags are not so rich.

    Extract NER tags if available and convert them so that they follow
    BILUO and the Wikipedia scheme.
    """
    # by @dvsrepo, via #11 explosion/spacy-dev-resources
    # by @katarkor
    docs = []
    sentences = []
    conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
    checked_for_ner = False
    has_ner_tags = False
    for i, (raw_text, tokens) in enumerate(conll_tuples):
        sentence, brackets = tokens[0]
        if not checked_for_ner:
            # The IOB column (index 5 of the sentence tuple) of the first
            # token tells us whether the file carries NER annotation at all.
            has_ner_tags = is_ner(sentence[5][0])
            checked_for_ner = True
        sentences.append(generate_sentence(sentence, has_ner_tags))
        # Real-sized documents could be extracted using the comments on the
        # conllu document
        if len(sentences) % n_sents == 0:
            doc = create_doc(sentences, i)
            docs.append(doc)
            sentences = []
    if sentences:
        doc = create_doc(sentences, i)
        docs.append(doc)
    return docs
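
# A rough sketch of the returned shape: conllu2json() yields a list of docs,
# each {"id": ..., "paragraphs": [{"sentences": [{"tokens": [...]}]}]}, which
# is exactly what create_doc() and generate_sentence() below assemble.
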
def is_ner(tag):
    """
    Check the 10th column of the first token to determine if the file contains
    NER tags.
    """
    tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
    if tag_match:
        return True
    elif tag == "O":
        return True
    else:
        return False
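
# Illustrative calls (values assumed): is_ner("B-PER") and is_ner("O") both
# return True, while a bare "_" placeholder returns False.
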
def read_conllx(input_data, use_morphology=False, n=0):
    i = 0
    for sent in input_data.strip().split("\n\n"):
        lines = sent.strip().split("\n")
        if lines:
            # Skip CoNLL-U comment lines such as "# sent_id = ..."
            while lines[0].startswith("#"):
                lines.pop(0)
            tokens = []
            for line in lines:
                parts = line.split("\t")
                id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
                # Skip multi-word token ranges ("1-2") and empty nodes ("1.1")
                if "-" in id_ or "." in id_:
                    continue
                try:
                    id_ = int(id_) - 1
                    head = (int(head) - 1) if head not in ["0", "_"] else id_
                    dep = "ROOT" if dep == "root" else dep
                    tag = pos if tag == "_" else tag
                    tag = tag + "__" + morph if use_morphology else tag
                    iob = iob if iob else "O"
                    tokens.append((id_, word, tag, head, dep, iob))
                except:  # noqa: E722
                    print(line)
                    raise
            tuples = [list(t) for t in zip(*tokens)]
            yield (None, [[tuples, []]])
            i += 1
            if n >= 1 and i >= n:
                break
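
# Illustrative input (values assumed, tab-separated CoNLL-U): the line
# "1\tGreen\tgreen\tADJ\tJJ\t_\t2\tamod\t_\tB-PER" becomes the token tuple
# (0, "Green", "JJ", 1, "amod", "B-PER"); ids and heads are shifted to 0-based.
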
def simplify_tags(iob):
    """
    Simplify tags obtained from the dataset in order to follow the Wikipedia
    scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
    'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags
    to 'MISC'.
    """
    new_iob = []
    for tag in iob:
        tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
        if tag_match:
            prefix = tag_match.group(1)
            suffix = tag_match.group(2)
            if suffix == "GPE_LOC":
                suffix = "LOC"
            elif suffix == "GPE_ORG":
                suffix = "ORG"
            elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
                suffix = "MISC"
            tag = prefix + "-" + suffix
        new_iob.append(tag)
    return new_iob
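
# Illustrative mappings (tag names assumed): "B-GPE_LOC" -> "B-LOC",
# "I-GPE_ORG" -> "I-ORG", "B-EVT" -> "B-MISC", while "B-PER" is kept as-is.
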
def generate_sentence(sent, has_ner_tags):
    (id_, word, tag, head, dep, iob) = sent
    sentence = {}
    tokens = []
    if has_ner_tags:
        iob = simplify_tags(iob)
        biluo = iob_to_biluo(iob)
    for i, id in enumerate(id_):
        token = {}
        token["id"] = id
        token["orth"] = word[i]
        token["tag"] = tag[i]
        # Heads are stored as relative offsets from the token's own id
        token["head"] = head[i] - id
        token["dep"] = dep[i]
        if has_ner_tags:
            token["ner"] = biluo[i]
        tokens.append(token)
    sentence["tokens"] = tokens
    return sentence
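
# Illustrative output (building on the read_conllx example above): the first
# token becomes {"id": 0, "orth": "Green", "tag": "JJ", "head": 1, "dep": "amod", ...},
# where "head" is the relative offset head[i] - id and "ner" (if present)
# holds the BILUO-converted tag, e.g. a lone "B-PER" comes out as "U-PER".
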
def create_doc(sentences, id):
    doc = {}
    paragraph = {}
    doc["id"] = id
    doc["paragraphs"] = []
    paragraph["sentences"] = sentences
    doc["paragraphs"].append(paragraph)
    return doc
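

if __name__ == "__main__":
    # Minimal smoke test with a made-up two-token sentence; the words, tags
    # and NER column are assumptions for illustration only. Note the relative
    # import of iob_to_biluo above means this module must be run from inside
    # the spaCy package (e.g. with python -m), not as a loose script.
    sample = (
        "1\tGreen\tgreen\tADJ\tJJ\t_\t2\tamod\t_\tB-PER\n"
        "2\tGoblin\tgoblin\tNOUN\tNN\t_\t0\troot\t_\tI-PER"
    )
    print(conllu2json(sample, n_sents=1))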