mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	* Clean unused functions from spacy.syntax.conll
This commit is contained in:
		
							parent
							
								
									78487f3e66
								
							
						
					
					
						commit
						efe7a7d7d6
					
				|  | @ -32,69 +32,6 @@ def read_json_file(loc): | |||
|     return paragraphs | ||||
| 
 | ||||
| 
 | ||||
def read_conll03_file(loc):
    """Read a CoNLL-2003 style file into a list of sentence tuples.

    The text is split into documents on the ``-DOCSTART- -X- O O`` marker and
    each document into sentences on blank lines. Every non-empty line of a
    sentence must hold exactly four whitespace-separated columns:
    word, tag, chunk, IOB entity tag. The chunk column is read but not used.

    Args:
        loc: Path of the UTF-8 encoded file to read.

    Returns:
        A list with one entry per sentence, each of the form
        ``(text, [words], (ids, words, tags, heads, labels, ents))`` where
        ``text`` is the words joined by single spaces, ``ids`` are the
        character offsets of the words within ``text``, ``heads`` is all
        ``-1``, ``labels`` is all ``'ROOT'`` (the format carries no parse),
        and ``ents`` is the result of ``_iob_to_biluo`` on the IOB column.

    Raises:
        ValueError: If a token line does not have exactly four columns.
    """
    # Close the file deterministically instead of relying on garbage
    # collection of the handle.
    with codecs.open(loc, 'r', 'utf8') as file_:
        text = file_.read().strip()
    sents = []
    for doc in text.split('-DOCSTART- -X- O O'):
        doc = doc.strip()
        if not doc:
            continue
        for sent_str in doc.split('\n\n'):
            sent_str = sent_str.strip()
            if not sent_str:
                # Three or more consecutive newlines yield empty chunks;
                # skip them rather than failing on the 4-column unpack below.
                continue
            words = []
            tags = []
            iob_ents = []
            ids = []
            idx = 0
            for line in sent_str.split('\n'):
                word, tag, _chunk, iob = line.split()
                if tag == '"':
                    # Normalise the plain double-quote tag to the
                    # opening-quote tag.
                    tag = '``'
                if '|' in tag:
                    # Ambiguous tags such as "A|B": keep the first option.
                    tag = tag.split('|')[0]
                words.append(word)
                tags.append(tag)
                iob_ents.append(iob)
                ids.append(idx)
                # +1 accounts for the single space used to join the words.
                idx += len(word) + 1
            heads = [-1] * len(words)
            labels = ['ROOT'] * len(words)
            sents.append((' '.join(words), [words],
                          (ids, words, tags, heads, labels,
                           _iob_to_biluo(iob_ents))))
    return sents
| 
 | ||||
| 
 | ||||
def read_docparse_file(loc):
    """Read a dependency-parse annotation file into a list of sentence tuples.

    The file is split into chunks on blank lines. Within a chunk the first
    line is the raw text, the second line is the tokenisation (sentences
    separated by ``<SENT>``, with ``<SEP>`` treated like a space between
    tokens), and each remaining line is one token's annotation as parsed by
    ``_parse_line``.

    Args:
        loc: Path of the UTF-8 encoded file to read.

    Returns:
        A list with one entry per chunk, each of the form
        ``(raw_text, tokenized, (ids, words, tags, heads, labels, iob_ents),
        [])``. A ``'root'`` label is upper-cased to ``'ROOT'`` and a negative
        head index is replaced by the token's own id.

    Raises:
        IndexError: If a non-empty chunk has fewer than two lines.
    """
    # Close the file deterministically instead of leaking the handle.
    with codecs.open(loc, 'r', 'utf8') as file_:
        text = file_.read().strip()
    sents = []
    for sent_str in text.split('\n\n'):
        sent_str = sent_str.strip()
        if not sent_str:
            # An empty file or a run of extra blank lines yields empty
            # chunks; skip them instead of failing on the header pops below.
            continue
        words = []
        heads = []
        labels = []
        tags = []
        ids = []
        iob_ents = []
        lines = sent_str.split('\n')
        raw_text = lines.pop(0).strip()
        tok_text = lines.pop(0).strip()
        for line in lines:
            id_, word, pos_string, head_idx, label, iob_ent = _parse_line(line)
            if label == 'root':
                label = 'ROOT'
            if head_idx < 0:
                # A negative head makes the token its own head.
                head_idx = id_
            ids.append(id_)
            words.append(word)
            tags.append(pos_string)
            heads.append(head_idx)
            labels.append(label)
            iob_ents.append(iob_ent)
        tokenized = [s.replace('<SEP>', ' ').split(' ')
                     for s in tok_text.split('<SENT>')]
        tuples = (ids, words, tags, heads, labels, iob_ents)
        sents.append((raw_text, tokenized, tuples, []))
    return sents
| 
 | ||||
| 
 | ||||
| def _iob_to_biluo(tags): | ||||
|     out = [] | ||||
|     curr_label = None | ||||
|  | @ -128,20 +65,6 @@ def _consume_ent(tags): | |||
|         return [start] + middle + [end] | ||||
| 
 | ||||
| 
 | ||||
| def _parse_line(line): | ||||
|     pieces = line.split() | ||||
|     if len(pieces) == 4: | ||||
|         return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3] | ||||
|     else: | ||||
|         id_ = int(pieces[0]) | ||||
|         word = pieces[1] | ||||
|         pos = pieces[3] | ||||
|         iob_ent = pieces[5] | ||||
|         head_idx = int(pieces[6]) | ||||
|         label = pieces[7] | ||||
|         return id_, word, pos, head_idx, label, iob_ent | ||||
| 
 | ||||
| 
 | ||||
| cdef class GoldParse: | ||||
|     def __init__(self, tokens, annot_tuples, brackets=tuple()): | ||||
|         self.mem = Pool() | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user