Mirror of https://github.com/explosion/spaCy.git, synced 2025-02-06 06:30:35 +03:00
Add iob_utils
This commit is contained in:
parent 53e6473e24
commit 156466ca69
spacy/_gold/iob_utils.py (new file, 189 lines)

@@ -0,0 +1,189 @@
import warnings

from ..errors import Errors, Warnings
from ..tokens import Span

def iob_to_biluo(tags):
    out = []
    tags = list(tags)
    while tags:
        out.extend(_consume_os(tags))
        out.extend(_consume_ent(tags))
    return out

def biluo_to_iob(tags):
    out = []
    for tag in tags:
        tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1)
        out.append(tag)
    return out

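# A minimal sketch of the conversion round trip between the two functions
# above, using hypothetical ORG/LOC tag sequences:
#   iob_to_biluo(["B-ORG", "I-ORG", "O", "B-LOC"])
#   # -> ["B-ORG", "L-ORG", "O", "U-LOC"]
#   biluo_to_iob(["B-ORG", "L-ORG", "O", "U-LOC"])
#   # -> ["B-ORG", "I-ORG", "O", "B-LOC"]
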
def _consume_os(tags):
    # Pop and yield leading "O" tags until the first entity tag is reached.
    while tags and tags[0] == "O":
        yield tags.pop(0)

def _consume_ent(tags):
    # Pop one entity's worth of IOB tags from the front of the list and
    # return the equivalent BILUO tags.
    if not tags:
        return []
    tag = tags.pop(0)
    target_in = "I" + tag[1:]
    target_last = "L" + tag[1:]
    length = 1
    while tags and tags[0] in {target_in, target_last}:
        length += 1
        tags.pop(0)
    label = tag[2:]
    if length == 1:
        if len(label) == 0:
            raise ValueError(Errors.E177.format(tag=tag))
        return ["U-" + label]
    else:
        start = "B-" + label
        end = "L-" + label
        middle = [f"I-{label}" for _ in range(1, length - 1)]
        return [start] + middle + [end]

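# A minimal sketch of the helper's contract, with hypothetical tags: it
# consumes one entity from the front of the list, mutating it in place.
#   tags = ["B-ORG", "I-ORG", "O"]
#   _consume_ent(tags)  # -> ["B-ORG", "L-ORG"]; tags is now ["O"]
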
def biluo_tags_from_offsets(doc, entities, missing="O"):
|
||||
"""Encode labelled spans into per-token tags, using the
|
||||
Begin/In/Last/Unit/Out scheme (BILUO).
|
||||
|
||||
doc (Doc): The document that the entity offsets refer to. The output tags
|
||||
will refer to the token boundaries within the document.
|
||||
entities (iterable): A sequence of `(start, end, label)` triples. `start`
|
||||
and `end` should be character-offset integers denoting the slice into
|
||||
the original string.
|
||||
RETURNS (list): A list of unicode strings, describing the tags. Each tag
|
||||
string will be of the form either "", "O" or "{action}-{label}", where
|
||||
action is one of "B", "I", "L", "U". The string "-" is used where the
|
||||
entity offsets don't align with the tokenization in the `Doc` object.
|
||||
The training algorithm will view these as missing values. "O" denotes a
|
||||
non-entity token. "B" denotes the beginning of a multi-token entity,
|
||||
"I" the inside of an entity of three or more tokens, and "L" the end
|
||||
of an entity of two or more tokens. "U" denotes a single-token entity.
|
||||
|
||||
EXAMPLE:
|
||||
>>> text = 'I like London.'
|
||||
>>> entities = [(len('I like '), len('I like London'), 'LOC')]
|
||||
>>> doc = nlp.tokenizer(text)
|
||||
>>> tags = biluo_tags_from_offsets(doc, entities)
|
||||
>>> assert tags == ["O", "O", 'U-LOC', "O"]
|
||||
"""
|
||||
# Ensure no overlapping entity labels exist
|
||||
tokens_in_ents = {}
|
||||
|
||||
starts = {token.idx: token.i for token in doc}
|
||||
ends = {token.idx + len(token): token.i for token in doc}
|
||||
biluo = ["-" for _ in doc]
|
||||
# Handle entity cases
|
||||
for start_char, end_char, label in entities:
|
||||
for token_index in range(start_char, end_char):
|
||||
if token_index in tokens_in_ents.keys():
|
||||
raise ValueError(
|
||||
Errors.E103.format(
|
||||
span1=(
|
||||
tokens_in_ents[token_index][0],
|
||||
tokens_in_ents[token_index][1],
|
||||
tokens_in_ents[token_index][2],
|
||||
),
|
||||
span2=(start_char, end_char, label),
|
||||
)
|
||||
)
|
||||
tokens_in_ents[token_index] = (start_char, end_char, label)
|
||||
|
||||
start_token = starts.get(start_char)
|
||||
end_token = ends.get(end_char)
|
||||
# Only interested if the tokenization is correct
|
||||
if start_token is not None and end_token is not None:
|
||||
if start_token == end_token:
|
||||
biluo[start_token] = f"U-{label}"
|
||||
else:
|
||||
biluo[start_token] = f"B-{label}"
|
||||
for i in range(start_token + 1, end_token):
|
||||
biluo[i] = f"I-{label}"
|
||||
biluo[end_token] = f"L-{label}"
|
||||
# Now distinguish the O cases from ones where we miss the tokenization
|
||||
entity_chars = set()
|
||||
for start_char, end_char, label in entities:
|
||||
for i in range(start_char, end_char):
|
||||
entity_chars.add(i)
|
||||
for token in doc:
|
||||
for i in range(token.idx, token.idx + len(token)):
|
||||
if i in entity_chars:
|
||||
break
|
||||
else:
|
||||
biluo[token.i] = missing
|
||||
if "-" in biluo:
|
||||
ent_str = str(entities)
|
||||
warnings.warn(
|
||||
Warnings.W030.format(
|
||||
text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text,
|
||||
entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
|
||||
)
|
||||
)
|
||||
return biluo
|
||||
|
||||
|
||||
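# A minimal sketch of the misalignment case, assuming an `nlp` object with
# a tokenizer as in the docstring example above: character offsets that cut
# through a token come back as "-" and trigger the W030 warning.
#   doc = nlp.tokenizer("I like London.")
#   biluo_tags_from_offsets(doc, [(7, 9, "LOC")])  # "Lo" splits "London"
#   # -> ["O", "O", "-", "O"], plus a W030 alignment warning
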
def spans_from_biluo_tags(doc, tags):
    """Encode per-token tags following the BILUO scheme into Span objects,
    e.g. to overwrite the doc.ents.

    doc (Doc): The document that the BILUO tags refer to.
    tags (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tag string will be of the form either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of Span objects.
    """
    token_offsets = tags_to_entities(tags)
    spans = []
    for label, start_idx, end_idx in token_offsets:
        span = Span(doc, start_idx, end_idx + 1, label=label)
        spans.append(span)
    return spans

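# A minimal usage sketch, assuming the same hypothetical `nlp` object as in
# the docstring example: the returned spans can be assigned to doc.ents.
#   doc = nlp.tokenizer("I like London.")
#   doc.ents = spans_from_biluo_tags(doc, ["O", "O", "U-LOC", "O"])
#   # doc.ents now contains one LOC span covering "London"
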
def offsets_from_biluo_tags(doc, tags):
    """Encode per-token tags following the BILUO scheme into entity offsets.

    doc (Doc): The document that the BILUO tags refer to.
    tags (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tag string will be of the form either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
        `end` will be character-offset integers denoting the slice into the
        original string.
    """
    spans = spans_from_biluo_tags(doc, tags)
    return [(span.start_char, span.end_char, span.label_) for span in spans]

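# A minimal round-trip sketch, again assuming the hypothetical `nlp` object:
#   doc = nlp.tokenizer("I like London.")
#   tags = biluo_tags_from_offsets(doc, [(7, 13, "LOC")])
#   offsets_from_biluo_tags(doc, tags)  # -> [(7, 13, "LOC")]
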
def tags_to_entities(tags):
    entities = []
    start = None
    for i, tag in enumerate(tags):
        if tag is None:
            continue
        if tag.startswith("O"):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            if start is not None:
                start = None
            continue
        elif tag == "-":
            continue
        elif tag.startswith("I"):
            if start is None:
                raise ValueError(Errors.E067.format(tags=tags[: i + 1]))
            continue
        if tag.startswith("U"):
            entities.append((tag[2:], i, i))
        elif tag.startswith("B"):
            start = i
        elif tag.startswith("L"):
            entities.append((tag[2:], start, i))
            start = None
        else:
            raise ValueError(Errors.E068.format(tag=tag))
    return entities

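# A minimal sketch with hypothetical tags; note that the returned token
# indices are inclusive (spans_from_biluo_tags adds 1 to the end index):
#   tags_to_entities(["O", "B-ORG", "L-ORG", "U-LOC"])
#   # -> [("ORG", 1, 2), ("LOC", 3, 3)]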