Add offsets_from_biluo_tags helper and tests (see #1626)

parent e4ee666be5
commit c699aec089
@@ -541,5 +541,24 @@ def biluo_tags_from_offsets(doc, entities, missing='O'):
     return biluo


+def offsets_from_biluo_tags(doc, tags):
+    """Encode per-token tags following the BILUO scheme into entity offsets.
+
+    doc (Doc): The document that the BILUO tags refer to.
+    tags (iterable): A sequence of BILUO tags with each tag describing one
+        token. Each tag string will be of the form of either "", "O" or
+        "{action}-{label}", where action is one of "B", "I", "L", "U".
+    RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
+        `end` will be character-offset integers denoting the slice into the
+        original string.
+    """
+    token_offsets = tags_to_entities(tags)
+    offsets = []
+    for label, start_idx, end_idx in token_offsets:
+        span = doc[start_idx : end_idx + 1]
+        offsets.append((span.start_char, span.end_char, label))
+    return offsets
+
+
 def is_punct_label(label):
     return label == 'P' or label.lower() == 'punct'
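For context (not part of the diff): a minimal usage sketch of the new helper, mirroring the round-trip test added below. It assumes spaCy's English tokenizer splits the sentence into the eight tokens the tags describe, and that the function is importable from spacy.gold, the module this diff adds it to.

    from spacy.gold import offsets_from_biluo_tags
    from spacy.lang.en import English

    nlp = English()
    doc = nlp("I flew to Silicon Valley via London.")
    # One BILUO tag per token: B/I/L mark multi-token entities, U a
    # single-token entity, O a token outside any entity.
    tags = ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O', 'U-GPE', 'O']
    print(offsets_from_biluo_tags(doc, tags))
    # [(10, 24, 'LOC'), (29, 35, 'GPE')]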
@@ -1,7 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from ...gold import biluo_tags_from_offsets
+from ...gold import biluo_tags_from_offsets, offsets_from_biluo_tags
 from ...tokens.doc import Doc

 import pytest
@@ -41,3 +41,14 @@ def test_gold_biluo_misalign(en_vocab):
     entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')]
     tags = biluo_tags_from_offsets(doc, entities)
     assert tags == ['O', 'O', 'O', '-', '-', '-']
+
+
+def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
+    text = "I flew to Silicon Valley via London."
+    biluo_tags = ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O', 'U-GPE', 'O']
+    offsets = [(10, 24, 'LOC'), (29, 35, 'GPE')]
+    doc = en_tokenizer(text)
+    biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
+    assert biluo_tags_converted == biluo_tags
+    offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
+    assert offsets_converted == offsets
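The expected offsets in the test can be sanity-checked by slicing the raw string (standalone snippet, not part of the commit): each `(start, end)` pair is a character slice into the original text, which is exactly what the new helper returns.

    text = "I flew to Silicon Valley via London."
    assert text[10:24] == "Silicon Valley"  # (10, 24, 'LOC')
    assert text[29:35] == "London"          # (29, 35, 'GPE')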