mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-24 20:51:30 +03:00 
			
		
		
		
	* OrigAnnot class instead of gold.orig_annot list of zipped tuples * from_orig to replace from_annot_tuples * rename to RawAnnot * some unit tests for GoldParse creation and internal format * removing orig_annot and switching to lists instead of tuple * rewriting tuples to use RawAnnot (+ debug statements, WIP) * fix pop() changing the data * small fixes * pop-append fixes * return RawAnnot for existing GoldParse to have uniform interface * clean up imports * fix merge_sents * add unit test for 4402 with new structure (not working yet) * introduce DocAnnot * typo fixes * add unit test for merge_sents * rename from_orig to from_raw * fixing unit tests * fix nn parser * read_annots to produce text, doc_annot pairs * _make_golds fix * rename golds_to_gold_annots * small fixes * fix encoding * have golds_to_gold_annots use DocAnnot * missed a spot * merge_sents as function in DocAnnot * allow specifying only part of the token-level annotations * refactor with Example class + underlying dicts * pipeline components to work with Example objects (wip) * input checking * fix yielding * fix calls to update * small fixes * fix scorer unit test with new format * fix kwargs order * fixes for ud and conllu scripts * fix reading data for conllu script * add in proper errors (not fixed numbering yet to avoid merge conflicts) * fixing few more small bugs * fix EL script
		
			
				
	
	
		
			147 lines
		
	
	
		
			4.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			147 lines
		
	
	
		
			4.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf8
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| import re
 | |
| 
 | |
| from spacy.gold import Example
 | |
| from ...gold import iob_to_biluo
 | |
| 
 | |
| 
 | |
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
    """
    Convert conllu files into JSON format for use with train cli.
    use_morphology parameter enables appending morphology to tags, which is
    useful for languages such as Spanish, where UD tags are not so rich.

    Extract NER tags if available and convert them so that they follow
    BILUO and the Wikipedia scheme

    input_data: the full text of the conllu file.
    n_sents: number of sentences grouped into each output document. The
        last document holds whatever sentences remain and may be shorter.
    use_morphology: forwarded to read_conllx.
    lang: accepted for interface compatibility; not used by this function.
    RETURNS: a list of document dicts as produced by create_doc.
    """
    # by @dvsrepo, via #11 explosion/spacy-dev-resources
    # by @katarkor
    docs = []
    sentences = []
    conll_data = read_conllx(input_data, use_morphology=use_morphology)
    checked_for_ner = False
    has_ner_tags = False
    last_index = 0
    for i, example in enumerate(conll_data):
        last_index = i
        for token_annotation in example.token_annotations:
            # Decide once, from the first available token, whether the last
            # column carries NER tags. Sentences without any tokens are
            # skipped for this check instead of raising an IndexError.
            if not checked_for_ner and token_annotation.entities:
                has_ner_tags = is_ner(token_annotation.entities[0])
                checked_for_ner = True
            sentences.append(generate_sentence(token_annotation, has_ner_tags))
            # Real-sized documents could be extracted using the comments on the
            # conllu document
            if len(sentences) % n_sents == 0:
                docs.append(create_doc(sentences, i))
                sentences = []
    # Flush the trailing sentences that did not fill a complete group of
    # n_sents; without this they would be silently dropped from the output.
    if sentences:
        docs.append(create_doc(sentences, last_index))
    return docs
 | |
| 
 | |
| 
 | |
def is_ner(tag):
    """
    Tell whether a value from the last conllu column looks like an NER tag.

    The value counts as an NER tag when it is the outside marker "O" or when
    it starts with an uppercase/underscore prefix, a hyphen and an
    uppercase/underscore label (for example "B-PER").

    tag: the string to inspect.
    RETURNS: True if the value looks like an NER tag, False otherwise.
    """
    if tag == "O":
        return True
    return re.match("([A-Z_]+)-([A-Z_]+)", tag) is not None
 | |
| 
 | |
| 
 | |
def read_conllx(input_data, use_morphology=False, n=0):
    """ Yield example data points, one for each sentence

    Sentences are separated by blank lines. Leading comment lines (starting
    with "#") are dropped, and blocks with no token lines left are skipped.
    Every token line must have exactly ten tab-separated columns. Lines whose
    id contains "-" or "." are ignored.

    input_data: the full text of the conllu file.
    use_morphology: if True, append "__" plus the morphology column to the tag.
    n: if 1 or greater, stop after yielding this many examples.
    YIELDS: one Example per sentence, carrying ids, words, tags, heads, deps
        and entities as token annotations.
    """
    i = 0
    for sent in input_data.strip().split("\n\n"):
        lines = sent.strip().split("\n")
        # Guard on `lines` so a comment-only block cannot index an empty list.
        while lines and lines[0].startswith("#"):
            lines.pop(0)
        # Nothing but comments/blank text (this includes empty input): skip
        # the block instead of failing on the column unpacking below.
        if not any(line.strip() for line in lines):
            continue
        ids, words, tags, heads, deps, ents = [], [], [], [], [], []
        for line in lines:
            try:
                # Unpacking happens inside the try so that a line with the
                # wrong number of columns is printed before the error
                # propagates, just like a line with a non-numeric id or head.
                parts = line.split("\t")
                id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
                if "-" in id_ or "." in id_:
                    continue
                # Ids and heads become 0-based; a head of "0" (the root)
                # points at the token itself.
                id_ = int(id_) - 1
                head = (int(head) - 1) if head != "0" else id_
                dep = "ROOT" if dep == "root" else dep
                tag = pos if tag == "_" else tag
                tag = tag + "__" + morph if use_morphology else tag
                iob = iob or "O"

                ids.append(id_)
                words.append(word)
                tags.append(tag)
                heads.append(head)
                deps.append(dep)
                ents.append(iob)
            except Exception:
                # Show the offending line, then let the error propagate.
                print(line)
                raise
        example = Example(doc=None)
        example.add_token_annotation(ids=ids, words=words, tags=tags,
                                     heads=heads, deps=deps, entities=ents)
        yield example
        i += 1
        if 1 <= n <= i:
            break
 | |
| 
 | |
| 
 | |
def simplify_tags(iob):
    """
    Map the entity labels of a tag sequence onto the Wikipedia scheme
    (PER, LOC, ORG, MISC).

    'PER', 'LOC' and 'ORG' are kept, 'GPE_LOC' becomes 'LOC', 'GPE_ORG'
    becomes 'ORG' and every other label becomes 'MISC'. Tags that do not
    have the prefix-label shape (such as "O") are passed through unchanged.

    iob: iterable of tag strings.
    RETURNS: a new list with the simplified tags, in the same order.
    """
    renamed = {"GPE_LOC": "LOC", "GPE_ORG": "ORG"}
    kept = ("PER", "LOC", "ORG")

    def simplify(tag):
        found = re.match("([A-Z_]+)-([A-Z_]+)", tag)
        if found is None:
            return tag
        prefix, label = found.group(1), found.group(2)
        if label in renamed:
            label = renamed[label]
        elif label not in kept:
            label = "MISC"
        return prefix + "-" + label

    return [simplify(tag) for tag in iob]
 | |
| 
 | |
| 
 | |
def generate_sentence(token_annotation, has_ner_tags):
    """
    Build the JSON-style dict for one sentence.

    token_annotation: object exposing parallel sequences `ids`, `words`,
        `tags`, `heads`, `deps` and `entities`.
    has_ner_tags: if true, the entities are simplified, converted with
        iob_to_biluo and stored per token under "ner".
    RETURNS: a dict {"tokens": [...]} with one dict per token.
    """
    biluo = None
    if has_ner_tags:
        biluo = iob_to_biluo(simplify_tags(token_annotation.entities))
    tokens = []
    for index, token_id in enumerate(token_annotation.ids):
        entry = {
            "id": token_id,
            "orth": token_annotation.words[index],
            "tag": token_annotation.tags[index],
            # The head is stored as an offset relative to the token's own id.
            "head": token_annotation.heads[index] - token_id,
            "dep": token_annotation.deps[index],
        }
        if has_ner_tags:
            entry["ner"] = biluo[index]
        tokens.append(entry)
    return {"tokens": tokens}
 | |
| 
 | |
| 
 | |
def create_doc(sentences, id):
    """
    Wrap a list of sentences into a document dict with a single paragraph.

    sentences: the sentence dicts to place in the paragraph; the list is
        stored as-is, not copied.
    id: value stored under the document's "id" key.
    RETURNS: {"id": id, "paragraphs": [{"sentences": sentences}]}
    """
    return {"id": id, "paragraphs": [{"sentences": sentences}]}
 |