Add offsets_from_biluo_tags helper and tests (see #1626)

parent e4ee666be5
commit c699aec089
@@ -541,5 +541,24 @@ def biluo_tags_from_offsets(doc, entities, missing='O'):
     return biluo


+def offsets_from_biluo_tags(doc, tags):
+    """Encode per-token tags following the BILUO scheme into entity offsets.
+
+    doc (Doc): The document that the BILUO tags refer to.
+    tags (iterable): A sequence of BILUO tags with each tag describing one
+        token. Each tag string will be of the form of either "", "O" or
+        "{action}-{label}", where action is one of "B", "I", "L", "U".
+    RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
+        `end` will be character-offset integers denoting the slice into the
+        original string.
+    """
+    token_offsets = tags_to_entities(tags)
+    offsets = []
+    for label, start_idx, end_idx in token_offsets:
+        span = doc[start_idx : end_idx + 1]
+        offsets.append((span.start_char, span.end_char, label))
+    return offsets
+
+
 def is_punct_label(label):
     return label == 'P' or label.lower() == 'punct'
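For context (not part of the diff): a minimal usage sketch of the new helper, mirroring the round-trip test added below. It assumes spaCy's English tokenizer splits the sentence into the eight tokens the tags describe, and that the function is importable from spacy.gold, the module this diff adds it to.

    from spacy.gold import offsets_from_biluo_tags
    from spacy.lang.en import English

    nlp = English()
    doc = nlp("I flew to Silicon Valley via London.")
    # One BILUO tag per token: B/I/L mark multi-token entities, U a
    # single-token entity, O a token outside any entity.
    tags = ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O', 'U-GPE', 'O']
    print(offsets_from_biluo_tags(doc, tags))
    # [(10, 24, 'LOC'), (29, 35, 'GPE')]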
@@ -1,7 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from ...gold import biluo_tags_from_offsets
+from ...gold import biluo_tags_from_offsets, offsets_from_biluo_tags
 from ...tokens.doc import Doc

 import pytest
@@ -41,3 +41,14 @@ def test_gold_biluo_misalign(en_vocab):
     entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')]
     tags = biluo_tags_from_offsets(doc, entities)
     assert tags == ['O', 'O', 'O', '-', '-', '-']
+
+
+def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
+    text = "I flew to Silicon Valley via London."
+    biluo_tags = ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O', 'U-GPE', 'O']
+    offsets = [(10, 24, 'LOC'), (29, 35, 'GPE')]
+    doc = en_tokenizer(text)
+    biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
+    assert biluo_tags_converted == biluo_tags
+    offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
+    assert offsets_converted == offsets
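The expected offsets in the test can be sanity-checked by slicing the raw string (standalone snippet, not part of the commit): each `(start, end)` pair is a character slice into the original text, which is exactly what the new helper returns.

    text = "I flew to Silicon Valley via London."
    assert text[10:24] == "Silicon Valley"  # (10, 24, 'LOC')
    assert text[29:35] == "London"          # (29, 35, 'GPE')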