Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)
			
		
		
		
	Filter bad retokenizations
This commit is contained in:
		
parent 42bc3ad73b
commit 4c8730526b
					
```diff
@@ -231,9 +231,14 @@ def write_conllu(docs, file_):
     for i, doc in enumerate(docs):
         matches = merger(doc)
         spans = [doc[start : end + 1] for _, start, end in matches]
+        seen_tokens = set()
         with doc.retokenize() as retokenizer:
             for span in spans:
+                span_tokens = set(range(span.start, span.end))
+                if not span_tokens.intersection(seen_tokens):
                     retokenizer.merge(span)
+                    seen_tokens.update(span_tokens)
+
         file_.write("# newdoc id = {i}\n".format(i=i))
         for j, sent in enumerate(doc.sents):
             file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
```
		Loading…
	
		Reference in New Issue
	
	Block a user