mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
492d1ec5de
* remove empty gold.pyx * add alignment unit test (to be used in docs) * ensure that Alignment is only used on equal texts * additional test using example.alignment * formatting Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
35 lines
961 B
Python
35 lines
961 B
Python
from typing import List
|
|
import numpy
|
|
from thinc.types import Ragged
|
|
from dataclasses import dataclass
|
|
import tokenizations
|
|
|
|
from ..errors import Errors
|
|
|
|
|
|
@dataclass
class Alignment:
    """Pair of ragged index mappings between two token sequences.

    Instances are normally created through `from_indices` (from nested
    index lists) or `from_strings` (from two lists of token texts).
    """

    # Mapping from the first sequence to the second, stored as a Ragged array.
    # NOTE(review): presumably row i lists the positions in the second
    # sequence aligned to item i of the first -- confirm against the
    # `tokenizations.get_alignments` documentation.
    x2y: Ragged
    # Mapping in the opposite direction (second sequence to first).
    y2x: Ragged

    @classmethod
    def from_indices(cls, x2y: List[List[int]], y2x: List[List[int]]) -> "Alignment":
        """Build an alignment from nested lists of indices.

        x2y: One list of integer indices per item of the first sequence.
        y2x: One list of integer indices per item of the second sequence.
        RETURNS: An instance of `cls` whose fields hold the two mappings
            converted to Ragged arrays by `_make_ragged`.
        """
        # Construct through `cls` (not the hard-coded class name) so that a
        # subclass calling this constructor receives an instance of itself.
        return cls(x2y=_make_ragged(x2y), y2x=_make_ragged(y2x))

    @classmethod
    def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
        """Build an alignment from two tokenizations given as string lists.

        The two token lists must spell out the same text once their tokens
        are concatenated, spaces are removed and the result is lowercased.

        A: Token texts of the first tokenization.
        B: Token texts of the second tokenization.
        RETURNS: An instance of `cls` built from the index lists returned by
            `tokenizations.get_alignments(A, B)`.
        RAISES: ValueError (with message `Errors.E949`) if the normalized
            texts of A and B differ.
        """
        text_a = "".join(A).replace(" ", "").lower()
        text_b = "".join(B).replace(" ", "").lower()
        # Refuse to align token lists that do not cover the same text.
        if text_a != text_b:
            raise ValueError(Errors.E949)
        x2y, y2x = tokenizations.get_alignments(A, B)
        # Dispatch through `cls` so subclasses are preserved here as well.
        return cls.from_indices(x2y=x2y, y2x=y2x)
|
|
|
|
|
|
def _make_ragged(indices):
    """Pack a sequence of index lists into a single Ragged array.

    indices: A sized collection of sized iterables of integers (for example a
        list of lists); it is traversed twice.
    RETURNS: A Ragged whose data is the concatenation of all inner items as a
        1-d "i"-dtype numpy array, and whose lengths array holds the size of
        each inner list in order.
    """
    # First pass: record how many entries each row contributes.
    row_sizes = [len(row) for row in indices]
    lengths = numpy.array(row_sizes, dtype="i")
    # Second pass: concatenate every row into one flat list of values.
    flattened = [value for row in indices for value in row]
    data = numpy.array(flattened, dtype="i")
    return Ragged(data, lengths)
|