spaCy/spacy/gold/align.py
Sofie Van Landeghem 492d1ec5de
Prevent alignment when texts don't match (#5867)
* remove empty gold.pyx

* add alignment unit test (to be used in docs)

* ensure that Alignment is only used on equal texts

* additional test using example.alignment

* formatting

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-08-04 16:29:18 +02:00

35 lines
961 B
Python

from typing import List
import numpy
from thinc.types import Ragged
from dataclasses import dataclass
import tokenizations
from ..errors import Errors
@dataclass
class Alignment:
x2y: Ragged
y2x: Ragged
@classmethod
def from_indices(cls, x2y: List[List[int]], y2x: List[List[int]]) -> "Alignment":
x2y = _make_ragged(x2y)
y2x = _make_ragged(y2x)
return Alignment(x2y=x2y, y2x=y2x)
@classmethod
def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower():
raise ValueError(Errors.E949)
x2y, y2x = tokenizations.get_alignments(A, B)
return Alignment.from_indices(x2y=x2y, y2x=y2x)
def _make_ragged(indices):
lengths = numpy.array([len(x) for x in indices], dtype="i")
flat = []
for x in indices:
flat.extend(x)
return Ragged(numpy.array(flat, dtype="i"), lengths)