spaCy/spacy/gold/align.py
Matthew Honnibal cc477be952
Improve gold-standard alignment (#5711)
* Remove previous alignment

* Implement better alignment, using ragged data structure

* Use pytokenizations for alignment

* Fixes

* Fixes

* Fix overlapping entities in alignment

* Fix align split_sents

* Update test

* Commit align.py

* Try to appease setuptools

* Fix flake8

* use realistic entities for testing

* Update tests for better alignment

* Improve alignment heuristic

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
2020-07-06 17:39:31 +02:00

31 lines
807 B
Python

from typing import List
import numpy
from thinc.types import Ragged
from dataclasses import dataclass
import tokenizations
@dataclass
class Alignment:
x2y: Ragged
y2x: Ragged
@classmethod
def from_indices(cls, x2y: List[List[int]], y2x: List[List[int]]) -> "Alignment":
x2y = _make_ragged(x2y)
y2x = _make_ragged(y2x)
return Alignment(x2y=x2y, y2x=y2x)
@classmethod
def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
x2y, y2x = tokenizations.get_alignments(A, B)
return Alignment.from_indices(x2y=x2y, y2x=y2x)
def _make_ragged(indices):
lengths = numpy.array([len(x) for x in indices], dtype="i")
flat = []
for x in indices:
flat.extend(x)
return Ragged(numpy.array(flat, dtype="i"), lengths)