mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	Merge branch 'master' into develop
This commit is contained in:
		
						commit
						d84b13e02c
					
				|  | @ -4,7 +4,7 @@ from __future__ import unicode_literals | |||
| import plac | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from .converters import conllu2json, iob2json, conll_ner2json | ||||
| from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json | ||||
| from ._messages import Messages | ||||
| from ..util import prints | ||||
| 
 | ||||
|  | @ -12,6 +12,7 @@ from ..util import prints | |||
| # entry to this dict with the file extension mapped to the converter function | ||||
| # imported from /converters. | ||||
| CONVERTERS = { | ||||
|     'conllubio': conllubio2json, | ||||
|     'conllu': conllu2json, | ||||
|     'conll': conllu2json, | ||||
|     'ner': conll_ner2json, | ||||
|  |  | |||
|  | @ -1,3 +1,4 @@ | |||
| from .conllu2json import conllu2json | ||||
| from .conllubio2json import conllubio2json | ||||
| from .iob2json import iob2json | ||||
| from .conll_ner2json import conll_ner2json | ||||
|  |  | |||
							
								
								
									
										95
									
								
								spacy/cli/converters/conllubio2json.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										95
									
								
								spacy/cli/converters/conllubio2json.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,95 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ...compat import json_dumps, path2str | ||||
| from ...util import prints | ||||
| from ...gold import iob_to_biluo | ||||
| 
 | ||||
def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False):
    """
    Convert conllu files carrying an extra NER column into JSON format for
    use with train cli.

    input_path: path object of the file to convert (must support ``.open``
        and ``.with_suffix``).
    output_path: directory path object; the output file is created inside
        it, named after the input file with its extension replaced by
        ``.json``.
    n_sents: number of sentences grouped into each output document.
    use_morphology: enables appending morphology to tags, which is useful
        for languages such as Spanish, where UD tags are not so rich.
    """
    # by @dvsrepo, via #11 explosion/spacy-dev-resources

    docs = []
    sentences = []
    conll_tuples = read_conllx(input_path, use_morphology=use_morphology)

    for i, (raw_text, tokens) in enumerate(conll_tuples):
        sentence, brackets = tokens[0]
        sentences.append(generate_sentence(sentence))
        # Real-sized documents could be extracted using the comments on the
        # conllu document
        if len(sentences) % n_sents == 0:
            docs.append(create_doc(sentences, i))
            sentences = []
    # Flush the remainder: without this, the trailing sentences of any file
    # whose length is not a multiple of n_sents would be silently dropped.
    if sentences:
        docs.append(create_doc(sentences, i))

    # Swap whatever extension the input has (.conll, .conllu, .conllubio)
    # for .json. Chained str.replace calls handled only ".conllu" and turned
    # ".conllubio" into ".jsonbio".
    output_filename = input_path.with_suffix(".json").name
    output_file = output_path / output_filename
    with output_file.open('w', encoding='utf-8') as f:
        f.write(json_dumps(docs))
    prints("Created %d documents" % len(docs),
           title="Generated output file %s" % path2str(output_file))
| 
 | ||||
| 
 | ||||
def read_conllx(input_path, use_morphology=False, n=0):
    """
    Yield the sentences of a tab-separated, 10-column CoNLL-style file whose
    last column holds the NER tag.

    input_path: path object supporting ``.open``.
    use_morphology: if True, the morphology column is appended to each tag
        as ``tag__morph``.
    n: if >= 1, stop after yielding this many sentences.

    Each item is ``(None, [[columns, []]])`` where ``columns`` is a list of
    six parallel lists: ids (0-based), words, tags, heads (0-based, the root
    points at itself), deps and NER tags.

    Raises ValueError if a token line does not have exactly 10 columns or if
    its id/head is not an integer.
    """
    # Read everything up front so the handle is closed before the first
    # yield, even if the consumer never exhausts the generator.
    with input_path.open('r', encoding='utf-8') as file_:
        text = file_.read()
    i = 0
    for sent in text.strip().split('\n\n'):
        sent = sent.strip()
        if not sent:
            # Empty file or a run of extra blank lines: nothing to parse.
            continue
        lines = sent.split('\n')
        # Drop leading comment lines without indexing past the end when a
        # block consists only of comments.
        first = 0
        while first < len(lines) and lines[first].startswith('#'):
            first += 1
        lines = lines[first:]
        if not lines:
            continue
        tokens = []
        for line in lines:
            parts = line.split('\t')
            id_, word, lemma, pos, tag, morph, head, dep, _1, ner = parts
            # Multi-word token ranges ("1-2") and empty nodes ("1.1") are
            # skipped.
            if '-' in id_ or '.' in id_:
                continue
            try:
                id_ = int(id_) - 1
                head = (int(head) - 1) if head != '0' else id_
            except ValueError:
                # Show the offending line before propagating the error.
                print(line)
                raise
            dep = 'ROOT' if dep == 'root' else dep
            tag = pos if tag == '_' else tag
            tag = tag + '__' + morph if use_morphology else tag
            ner = ner if ner else 'O'
            tokens.append((id_, word, tag, head, dep, ner))
        # Transpose the per-token tuples into per-column lists.
        tuples = [list(t) for t in zip(*tokens)]
        yield (None, [[tuples, []]])
        i += 1
        if n >= 1 and i >= n:
            break
| 
 | ||||
def generate_sentence(sent):
    """
    Turn one sentence, given as six parallel per-token lists
    (ids, words, tags, heads, deps, NER tags), into a dict with a single
    "tokens" key holding one dict per token.

    The NER column is passed through iob_to_biluo before use, and each head
    is stored as an offset relative to the token's own id.
    """
    (id_, word, tag, head, dep, ner) = sent
    converted_ner = iob_to_biluo(ner)
    return {
        "tokens": [
            {
                "orth": word[position],
                "tag": tag[position],
                "head": head[position] - token_id,
                "dep": dep[position],
                "ner": converted_ner[position],
            }
            for position, token_id in enumerate(id_)
        ]
    }
| 
 | ||||
| 
 | ||||
def create_doc(sentences, id):
    """
    Wrap a list of sentence dicts into a document dict with the given id
    and a single paragraph that holds all of the sentences.
    """
    # The sentences list is stored as-is (not copied).
    return {
        "id": id,
        "paragraphs": [{"sentences": sentences}],
    }
		Loading…
	
		Reference in New Issue
	
	Block a user