mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	* Remove previous alignment * Implement better alignment, using ragged data structure * Use pytokenizations for alignment * Fixes * Fixes * Fix overlapping entities in alignment * Fix align split_sents * Update test * Commit align.py * Try to appease setuptools * Fix flake8 * use realistic entities for testing * Update tests for better alignment * Improve alignment heuristic Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
		
			
				
	
	
		
			31 lines
		
	
	
		
			807 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			31 lines
		
	
	
		
			807 B
		
	
	
	
		
			Python
		
	
	
	
	
	
| from typing import List
 | |
| import numpy
 | |
| from thinc.types import Ragged
 | |
| from dataclasses import dataclass
 | |
| import tokenizations
 | |
| 
 | |
| 
 | |
@dataclass
class Alignment:
    """Two-way alignment between two tokenizations, held as ragged arrays.

    Instances are normally built through one of the alternate constructors:
    ``from_indices`` (from nested index lists) or ``from_strings`` (from two
    token lists, aligned with ``tokenizations.get_alignments``).
    """

    # Ragged form of the ``x2y`` nested index list (see ``from_indices``).
    # Presumably entry i lists the y-side indices aligned to x token i --
    # confirm against the ``tokenizations.get_alignments`` documentation.
    x2y: Ragged
    # Ragged form of the ``y2x`` nested index list; the reverse direction.
    y2x: Ragged

    @classmethod
    def from_indices(cls, x2y: List[List[int]], y2x: List[List[int]]) -> "Alignment":
        """Build an alignment from two nested lists of integer indices.

        Args:
            x2y: One list of indices per item on the x side.
            y2x: One list of indices per item on the y side.

        Returns:
            A new instance of ``cls`` whose fields are the ragged conversions
            of the two inputs.
        """
        # Construct through ``cls`` (not the hard-coded class name) so that a
        # subclass calling this constructor gets an instance of its own type.
        # The parameters are not rebound, so they keep their annotated type.
        return cls(x2y=_make_ragged(x2y), y2x=_make_ragged(y2x))

    @classmethod
    def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
        """Align two token lists and wrap the result.

        Args:
            A: Token strings of the first tokenization.
            B: Token strings of the second tokenization.

        Returns:
            A new instance of ``cls`` built from the pair of index lists that
            ``tokenizations.get_alignments(A, B)`` returns.
        """
        x2y, y2x = tokenizations.get_alignments(A, B)
        return cls.from_indices(x2y=x2y, y2x=y2x)
 | |
| 
 | |
| 
 | |
def _make_ragged(indices):
    """Pack a nested sequence of integer lists into a ``Ragged``.

    The first array passed to ``Ragged`` is every inner element concatenated
    in order; the second holds the length of each inner sequence. Both are
    created with numpy dtype ``"i"``.

    ``indices`` is traversed twice (once for the lengths, once for the
    values), so it must be re-iterable.
    """
    # First pass: how many entries each inner sequence contributes.
    row_sizes = numpy.array(list(map(len, indices)), dtype="i")
    # Second pass: flatten all inner sequences, preserving their order.
    values = [item for row in indices for item in row]
    data = numpy.array(values, dtype="i")
    return Ragged(data, row_sizes)
 |