Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-10 19:57:17 +03:00)

Merge pull request #6109 from svlandeg/feature/2rename

Commit 60a317520a
@@ -378,7 +378,7 @@ def git_sparse_checkout(repo, subpath, dest, branch):
 # Looking for this 'rev-list' command in the git --help? Hah.
 cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
 ret = run_command(cmd, capture=True)
-git_repo = _from_http_to_git(repo)
+git_repo = _http_to_git(repo)
 # Now pass those missings into another bit of git internals
 missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
 if not missings:
@@ -414,7 +414,7 @@ def get_git_version(
 return (int(version[0]), int(version[1]))


-def _from_http_to_git(repo: str) -> str:
+def _http_to_git(repo: str) -> str:
 if repo.startswith("http://"):
 repo = repo.replace(r"http://", r"https://")
 if repo.startswith(r"https://"):
@@ -9,7 +9,7 @@ import sys
 from ._util import app, Arg, Opt
 from ..training import docs_to_json
 from ..tokens import DocBin
-from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
+from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs


 # Converters are matched by file extension except for ner/iob, which are
@@ -18,12 +18,12 @@ from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2do
 # imported from /converters.

 CONVERTERS = {
-"conllubio": conllu2docs,
-"conllu": conllu2docs,
-"conll": conllu2docs,
-"ner": conll_ner2docs,
-"iob": iob2docs,
-"json": json2docs,
+"conllubio": conllu_to_docs,
+"conllu": conllu_to_docs,
+"conll": conllu_to_docs,
+"ner": conll_ner_to_docs,
+"iob": iob_to_docs,
+"json": json_to_docs,
 }

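The hunk above is the converter registry keyed by input format. As a point of reference, the sketch below (not part of this commit) shows how one of the renamed converters might be called directly and the result saved in spaCy's binary training format; the sample IOB string and output path are made up.

```python
# Hypothetical usage sketch, assuming a spaCy v3 install where the renamed
# converters are available; the sample data and output path are made up.
from spacy.tokens import DocBin
from spacy.training.converters import iob_to_docs

iob_data = "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O"
docs = iob_to_docs(iob_data, n_sents=10)   # sentences grouped into Doc objects
doc_bin = DocBin(docs=docs)                # serializable container of Docs
doc_bin.to_disk("./train.spacy")           # same on-disk format `spacy convert` produces
```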
@@ -69,7 +69,7 @@ class Warnings:
 "in problems with the vocab further on in the pipeline.")
 W030 = ("Some entities could not be aligned in the text \"{text}\" with "
 "entities \"{entities}\". Use "
-"`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
+"`spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)`"
 " to check the alignment. Misaligned entities ('-') will be "
 "ignored during training.")
 W033 = ("Training a new {model} using a model with no lexeme normalization "
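W030 now points users at the renamed helper for debugging entity alignment. A minimal sketch of that check (not part of this commit; the text and offsets are made up):

```python
# Hypothetical alignment check following the advice in W030; any "-" tag marks
# an entity span that does not line up with token boundaries.
import spacy
from spacy.training import offsets_to_biluo_tags

nlp = spacy.blank("en")
text = "Mr. Best flew to New York on Saturday morning."
entities = [(0, 8, "PERSON"), (17, 25, "LOC")]
tags = offsets_to_biluo_tags(nlp.make_doc(text), entities)
print(tags)
```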
@@ -3,7 +3,7 @@ from spacy.pipeline import Pipe
 from spacy.matcher import PhraseMatcher, Matcher
 from spacy.tokens import Doc, Span, DocBin
 from spacy.training import Example, Corpus
-from spacy.training.converters import json2docs
+from spacy.training.converters import json_to_docs
 from spacy.vocab import Vocab
 from spacy.lang.en import English
 from spacy.util import minibatch, ensure_path, load_model
@@ -425,7 +425,7 @@ def test_issue4402():
 attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
 with make_tempdir() as tmpdir:
 output_file = tmpdir / "test4402.spacy"
-docs = json2docs([json_data])
+docs = json_to_docs([json_data])
 data = DocBin(docs=docs, attrs=attrs).to_bytes()
 with output_file.open("wb") as file_:
 file_.write(data)
@@ -1,7 +1,7 @@
 import pytest
 from spacy.tokens import Doc, Span, DocBin
 from spacy.training import Example
-from spacy.training.converters.conllu2docs import conllu2docs
+from spacy.training.converters.conllu_to_docs import conllu_to_docs
 from spacy.lang.en import English
 from spacy.kb import KnowledgeBase
 from spacy.vocab import Vocab
@@ -82,7 +82,7 @@ def test_issue4651_without_phrase_matcher_attr():

 def test_issue4665():
 """
-conllu2json should not raise an exception if the HEAD column contains an
+conllu_to_docs should not raise an exception if the HEAD column contains an
 underscore
 """
 input_data = """
@@ -105,7 +105,7 @@ def test_issue4665():
 17 . _ PUNCT . _ _ punct _ _
 18 ] _ PUNCT -RRB- _ _ punct _ _
 """
-conllu2docs(input_data)
+conllu_to_docs(input_data)


 def test_issue4674():
@@ -1,7 +1,7 @@
 import pytest
 from click import NoSuchOption
-from spacy.training import docs_to_json, biluo_tags_from_offsets
-from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
+from spacy.training import docs_to_json, offsets_to_biluo_tags
+from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
@@ -15,7 +15,7 @@ import os
 from .util import make_tempdir


-def test_cli_converters_conllu2json():
+def test_cli_converters_conllu_to_docs():
 # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
 lines = [
 "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
@@ -24,7 +24,7 @@ def test_cli_converters_conllu2json():
 "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
 ]
 input_data = "\n".join(lines)
-converted_docs = conllu2docs(input_data, n_sents=1)
+converted_docs = conllu_to_docs(input_data, n_sents=1)
 assert len(converted_docs) == 1
 converted = [docs_to_json(converted_docs)]
 assert converted[0]["id"] == 0
@@ -40,7 +40,7 @@ def test_cli_converters_conllu2json():
 ent_offsets = [
 (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
 ]
-biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
 assert biluo_tags == ["O", "B-PER", "L-PER", "O"]


@@ -63,9 +63,9 @@ def test_cli_converters_conllu2json():
 ),
 ],
 )
-def test_cli_converters_conllu2json_name_ner_map(lines):
+def test_cli_converters_conllu_to_docs_name_ner_map(lines):
 input_data = "\n".join(lines)
-converted_docs = conllu2docs(
+converted_docs = conllu_to_docs(
 input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}
 )
 assert len(converted_docs) == 1
@@ -84,11 +84,11 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
 ent_offsets = [
 (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
 ]
-biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
 assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]


-def test_cli_converters_conllu2json_subtokens():
+def test_cli_converters_conllu_to_docs_subtokens():
 # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
 lines = [
 "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
@@ -99,7 +99,7 @@ def test_cli_converters_conllu2json_subtokens():
 "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
 ]
 input_data = "\n".join(lines)
-converted_docs = conllu2docs(
+converted_docs = conllu_to_docs(
 input_data, n_sents=1, merge_subtokens=True, append_morphology=True
 )
 assert len(converted_docs) == 1
@@ -133,11 +133,11 @@ def test_cli_converters_conllu2json_subtokens():
 ent_offsets = [
 (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
 ]
-biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
 assert biluo_tags == ["O", "U-PER", "O", "O"]


-def test_cli_converters_iob2json():
+def test_cli_converters_iob_to_docs():
 lines = [
 "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
 "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -145,7 +145,7 @@ def test_cli_converters_iob2json():
 "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
 ]
 input_data = "\n".join(lines)
-converted_docs = iob2docs(input_data, n_sents=10)
+converted_docs = iob_to_docs(input_data, n_sents=10)
 assert len(converted_docs) == 1
 converted = docs_to_json(converted_docs)
 assert converted["id"] == 0
@@ -162,7 +162,7 @@ def test_cli_converters_iob2json():
 assert ent.text in ["New York City", "London"]


-def test_cli_converters_conll_ner2json():
+def test_cli_converters_conll_ner_to_docs():
 lines = [
 "-DOCSTART- -X- O O",
 "",
@@ -212,7 +212,7 @@ def test_cli_converters_conll_ner2json():
 ".\t.\t_\tO",
 ]
 input_data = "\n".join(lines)
-converted_docs = conll_ner2docs(input_data, n_sents=10)
+converted_docs = conll_ner_to_docs(input_data, n_sents=10)
 assert len(converted_docs) == 1
 converted = docs_to_json(converted_docs)
 assert converted["id"] == 0
@@ -2,7 +2,7 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal
 import pytest
 from pytest import approx
 from spacy.training import Example
-from spacy.training.iob_utils import biluo_tags_from_offsets
+from spacy.training.iob_utils import offsets_to_biluo_tags
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from spacy.lang.en import English
@@ -186,7 +186,7 @@ def test_ner_per_type(en_vocab):
 words=input_.split(" "),
 ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
 )
-entities = biluo_tags_from_offsets(doc, annot["entities"])
+entities = offsets_to_biluo_tags(doc, annot["entities"])
 example = Example.from_dict(doc, {"entities": entities})
 # a hack for sentence boundaries
 example.predicted[1].is_sent_start = False
@@ -211,7 +211,7 @@ def test_ner_per_type(en_vocab):
 words=input_.split(" "),
 ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
 )
-entities = biluo_tags_from_offsets(doc, annot["entities"])
+entities = offsets_to_biluo_tags(doc, annot["entities"])
 example = Example.from_dict(doc, {"entities": entities})
 # a hack for sentence boundaries
 example.predicted[1].is_sent_start = False
@@ -1,9 +1,9 @@
 import numpy
-from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
-from spacy.training import spans_from_biluo_tags, iob_to_biluo
+from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment
+from spacy.training import biluo_tags_to_spans, iob_to_biluo
 from spacy.training import Corpus, docs_to_json
 from spacy.training.example import Example
-from spacy.training.converters import json2docs
+from spacy.training.converters import json_to_docs
 from spacy.training.augment import make_orth_variants_example
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
@@ -69,7 +69,7 @@ def test_gold_biluo_U(en_vocab):
 spaces = [True, True, True, False, True]
 doc = Doc(en_vocab, words=words, spaces=spaces)
 entities = [(len("I flew to "), len("I flew to London"), "LOC")]
-tags = biluo_tags_from_offsets(doc, entities)
+tags = offsets_to_biluo_tags(doc, entities)
 assert tags == ["O", "O", "O", "U-LOC", "O"]


@@ -78,7 +78,7 @@ def test_gold_biluo_BL(en_vocab):
 spaces = [True, True, True, True, False, True]
 doc = Doc(en_vocab, words=words, spaces=spaces)
 entities = [(len("I flew to "), len("I flew to San Francisco"), "LOC")]
-tags = biluo_tags_from_offsets(doc, entities)
+tags = offsets_to_biluo_tags(doc, entities)
 assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O"]


@@ -87,7 +87,7 @@ def test_gold_biluo_BIL(en_vocab):
 spaces = [True, True, True, True, True, False, True]
 doc = Doc(en_vocab, words=words, spaces=spaces)
 entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
-tags = biluo_tags_from_offsets(doc, entities)
+tags = offsets_to_biluo_tags(doc, entities)
 assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]


@@ -100,7 +100,7 @@ def test_gold_biluo_overlap(en_vocab):
 (len("I flew to "), len("I flew to San Francisco"), "LOC"),
 ]
 with pytest.raises(ValueError):
-biluo_tags_from_offsets(doc, entities)
+offsets_to_biluo_tags(doc, entities)


 def test_gold_biluo_misalign(en_vocab):
@@ -109,7 +109,7 @@ def test_gold_biluo_misalign(en_vocab):
 doc = Doc(en_vocab, words=words, spaces=spaces)
 entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
 with pytest.warns(UserWarning):
-tags = biluo_tags_from_offsets(doc, entities)
+tags = offsets_to_biluo_tags(doc, entities)
 assert tags == ["O", "O", "O", "-", "-", "-"]


@@ -155,7 +155,7 @@ def test_example_from_dict_some_ner(en_vocab):


 @pytest.mark.filterwarnings("ignore::UserWarning")
-def test_json2docs_no_ner(en_vocab):
+def test_json_to_docs_no_ner(en_vocab):
 data = [
 {
 "id": 1,
@@ -191,7 +191,7 @@ def test_json2docs_no_ner(en_vocab):
 ],
 }
 ]
-docs = json2docs(data)
+docs = json_to_docs(data)
 assert len(docs) == 1
 for doc in docs:
 assert not doc.has_annotation("ENT_IOB")
@@ -358,9 +358,9 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
 biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
 offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
 doc = en_tokenizer(text)
-biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
+biluo_tags_converted = offsets_to_biluo_tags(doc, offsets)
 assert biluo_tags_converted == biluo_tags
-offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
+offsets_converted = biluo_tags_to_offsets(doc, biluo_tags)
 offsets_converted = [ent for ent in offsets if ent[2]]
 assert offsets_converted == offsets

@@ -368,7 +368,7 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
 def test_biluo_spans(en_tokenizer):
 doc = en_tokenizer("I flew to Silicon Valley via London.")
 biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
-spans = spans_from_biluo_tags(doc, biluo_tags)
+spans = biluo_tags_to_spans(doc, biluo_tags)
 spans = [span for span in spans if span.label_]
 assert len(spans) == 2
 assert spans[0].text == "Silicon Valley"
@@ -2,8 +2,8 @@ from .corpus import Corpus # noqa: F401
 from .example import Example, validate_examples # noqa: F401
 from .align import Alignment # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401
-from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags # noqa: F401
-from .iob_utils import spans_from_biluo_tags, tags_to_entities # noqa: F401
+from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401
+from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
 from .gold_io import docs_to_json, read_json_file # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
 from .loggers import console_logger, wandb_logger # noqa: F401
@@ -1,4 +1,4 @@
-from .iob2docs import iob2docs # noqa: F401
-from .conll_ner2docs import conll_ner2docs # noqa: F401
-from .json2docs import json2docs # noqa: F401
-from .conllu2docs import conllu2docs # noqa: F401
+from .iob_to_docs import iob_to_docs # noqa: F401
+from .conll_ner_to_docs import conll_ner_to_docs # noqa: F401
+from .json_to_docs import json_to_docs # noqa: F401
+from .conllu_to_docs import conllu_to_docs # noqa: F401
@@ -7,7 +7,7 @@ from ...tokens import Doc, Span
 from ...util import load_model


-def conll_ner2docs(
+def conll_ner_to_docs(
 input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
 """
@@ -1,13 +1,13 @@
 import re

-from .conll_ner2docs import n_sents_info
-from ...training import iob_to_biluo, spans_from_biluo_tags
+from .conll_ner_to_docs import n_sents_info
+from ...training import iob_to_biluo, biluo_tags_to_spans
 from ...tokens import Doc, Token, Span
 from ...vocab import Vocab
 from wasabi import Printer


-def conllu2docs(
+def conllu_to_docs(
 input_data,
 n_sents=10,
 append_morphology=False,
@@ -78,7 +78,7 @@ def read_conllx(
 if lines:
 while lines[0].startswith("#"):
 lines.pop(0)
-doc = doc_from_conllu_sentence(
+doc = conllu_sentence_to_doc(
 vocab,
 lines,
 ner_tag_pattern,
@@ -128,7 +128,7 @@ def get_entities(lines, tag_pattern, ner_map=None):
 return iob_to_biluo(iob)


-def doc_from_conllu_sentence(
+def conllu_sentence_to_doc(
 vocab,
 lines,
 ner_tag_pattern,
@@ -215,7 +215,7 @@ def doc_from_conllu_sentence(
 doc[i]._.merged_lemma = lemmas[i]
 doc[i]._.merged_spaceafter = spaces[i]
 ents = get_entities(lines, ner_tag_pattern, ner_map)
-doc.ents = spans_from_biluo_tags(doc, ents)
+doc.ents = biluo_tags_to_spans(doc, ents)

 if merge_subtokens:
 doc = merge_conllu_subtokens(lines, doc)
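For orientation, a minimal sketch of calling the renamed `conllu_to_docs` converter directly (not part of this commit; the four-token CoNLL-U fragment is made up and assumes plain IOB tags in the MISC column are accepted, as in the CLI tests above):

```python
# Hypothetical usage sketch with made-up CoNLL-U data.
from spacy.training.converters import conllu_to_docs

conllu = "\n".join([
    "1\tI\tI\tPRON\t_\t_\t2\tnsubj\t_\tO",
    "2\tlike\tlike\tVERB\t_\t_\t0\troot\t_\tO",
    "3\tLondon\tLondon\tPROPN\t_\t_\t2\tobj\t_\tB-GPE",
    "4\t.\t.\tPUNCT\t_\t_\t2\tpunct\t_\tO",
])
docs = conllu_to_docs(conllu, n_sents=1)
for doc in docs:
    print([(ent.text, ent.label_) for ent in doc.ents])
```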
@@ -1,13 +1,13 @@
 from wasabi import Printer

-from .conll_ner2docs import n_sents_info
+from .conll_ner_to_docs import n_sents_info
 from ...vocab import Vocab
 from ...training import iob_to_biluo, tags_to_entities
 from ...tokens import Doc, Span
 from ...util import minibatch


-def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
+def iob_to_docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
 """
 Convert IOB files with one sentence per line and tags separated with '|'
 into Doc objects so they can be saved. IOB and IOB2 are accepted.
@@ -1,12 +1,12 @@
 import srsly
 from ..gold_io import json_iterate, json_to_annotations
-from ..example import annotations2doc
+from ..example import annotations_to_doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
 from ...lang.xx import MultiLanguage


-def json2docs(input_data, model=None, **kwargs):
+def json_to_docs(input_data, model=None, **kwargs):
 nlp = load_model(model) if model is not None else MultiLanguage()
 if not isinstance(input_data, bytes):
 if not isinstance(input_data, str):
@@ -17,6 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
 for json_para in json_to_annotations(json_doc):
 example_dict = _fix_legacy_dict_data(json_para)
 tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
+doc = annotations_to_doc(nlp.vocab, tok_dict, doc_dict)
 docs.append(doc)
 return docs
@@ -7,13 +7,13 @@ from ..tokens.span cimport Span
 from ..tokens.span import Span
 from ..attrs import IDS
 from .align import Alignment
-from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
-from .iob_utils import spans_from_biluo_tags
+from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
+from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
 from ..pipeline._parser_internals import nonproj


-cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
+cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
 """ Create a Doc from dictionaries with token and doc annotations. """
 attrs, array = _annot2array(vocab, tok_annot, doc_annot)
 output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
@@ -92,7 +92,7 @@ cdef class Example:
 tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
 return Example(
 predicted,
-annotations2doc(predicted.vocab, tok_dict, doc_dict)
+annotations_to_doc(predicted.vocab, tok_dict, doc_dict)
 )

 @property
@@ -176,7 +176,7 @@ cdef class Example:
 return [None] * len(self.x) # should this be 'missing' instead of 'None' ?
 x_ents = self.get_aligned_spans_y2x(self.y.ents)
 # Default to 'None' for missing values
-x_tags = biluo_tags_from_offsets(
+x_tags = offsets_to_biluo_tags(
 self.x,
 [(e.start_char, e.end_char, e.label_) for e in x_ents],
 missing=None
@@ -195,7 +195,7 @@ cdef class Example:
 return {
 "doc_annotation": {
 "cats": dict(self.reference.cats),
-"entities": biluo_tags_from_doc(self.reference),
+"entities": doc_to_biluo_tags(self.reference),
 "links": self._links_to_dict()
 },
 "token_annotation": {
@@ -295,12 +295,12 @@ def _add_entities_to_doc(doc, ner_data):
 elif isinstance(ner_data[0], tuple):
 return _add_entities_to_doc(
 doc,
-biluo_tags_from_offsets(doc, ner_data)
+offsets_to_biluo_tags(doc, ner_data)
 )
 elif isinstance(ner_data[0], str) or ner_data[0] is None:
 return _add_entities_to_doc(
 doc,
-spans_from_biluo_tags(doc, ner_data)
+biluo_tags_to_spans(doc, ner_data)
 )
 elif isinstance(ner_data[0], Span):
 # Ugh, this is super messy. Really hard to set O entities
@@ -388,7 +388,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
 # This is annoying but to convert the offsets we need a Doc
 # that has the target tokenization.
 reference = Doc(vocab, words=words, spaces=spaces)
-biluo = biluo_tags_from_offsets(reference, biluo_or_offsets)
+biluo = offsets_to_biluo_tags(reference, biluo_or_offsets)
 else:
 biluo = biluo_or_offsets
 ent_iobs = []
@@ -3,7 +3,7 @@ import srsly
 from .. import util
 from ..errors import Warnings
 from ..tokens import Doc
-from .iob_utils import biluo_tags_from_offsets, tags_to_entities
+from .iob_utils import offsets_to_biluo_tags, tags_to_entities
 import json


@@ -32,7 +32,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
 if ent.kb_id_:
 link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
 json_para["links"].append(link_dict)
-biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
+biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
 attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
 include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
 for j, sent in enumerate(doc.sents):
@@ -51,7 +51,11 @@ def _consume_ent(tags):


 def biluo_tags_from_doc(doc, missing="O"):
-return biluo_tags_from_offsets(
+return doc_to_biluo_tags(doc, missing)
+
+
+def doc_to_biluo_tags(doc, missing="O"):
+return offsets_to_biluo_tags(
 doc,
 [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
 missing=missing,
@@ -59,6 +63,10 @@ def biluo_tags_from_doc(doc, missing="O"):


 def biluo_tags_from_offsets(doc, entities, missing="O"):
+return offsets_to_biluo_tags(doc, entities, missing)
+
+
+def offsets_to_biluo_tags(doc, entities, missing="O"):
 """Encode labelled spans into per-token tags, using the
 Begin/In/Last/Unit/Out scheme (BILUO).

@@ -80,7 +88,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
 >>> text = 'I like London.'
 >>> entities = [(len('I like '), len('I like London'), 'LOC')]
 >>> doc = nlp.tokenizer(text)
->>> tags = biluo_tags_from_offsets(doc, entities)
+>>> tags = offsets_to_biluo_tags(doc, entities)
 >>> assert tags == ["O", "O", 'U-LOC', "O"]
 """
 # Ensure no overlapping entity labels exist
@@ -144,6 +152,10 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):


 def spans_from_biluo_tags(doc, tags):
+return biluo_tags_to_spans(doc, tags)
+
+
+def biluo_tags_to_spans(doc, tags):
 """Encode per-token tags following the BILUO scheme into Span object, e.g.
 to overwrite the doc.ents.

@@ -162,6 +174,10 @@ def spans_from_biluo_tags(doc, tags):


 def offsets_from_biluo_tags(doc, tags):
+return biluo_tags_to_offsets(doc, tags)
+
+
+def biluo_tags_to_offsets(doc, tags):
 """Encode per-token tags following the BILUO scheme into entity offsets.

 doc (Doc): The document that the BILUO tags refer to.
@@ -172,7 +188,7 @@ def offsets_from_biluo_tags(doc, tags):
 `end` will be character-offset integers denoting the slice into the
 original string.
 """
-spans = spans_from_biluo_tags(doc, tags)
+spans = biluo_tags_to_spans(doc, tags)
 return [(span.start_char, span.end_char, span.label_) for span in spans]

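Note the pattern in the hunks above: the old helpers stay in `spacy.training.iob_utils` as thin wrappers that delegate to the new `x_to_y` names, while `spacy.training` itself now exports only the new names. A minimal sketch (not part of this commit) of what that equivalence looks like:

```python
# Hypothetical sketch: old and new spellings should return the same tags,
# since the legacy name simply forwards to the renamed helper.
import spacy
from spacy.training import offsets_to_biluo_tags
from spacy.training.iob_utils import biluo_tags_from_offsets  # legacy wrapper

nlp = spacy.blank("en")
doc = nlp("I like London.")
offsets = [(7, 13, "LOC")]
assert offsets_to_biluo_tags(doc, offsets) == biluo_tags_from_offsets(doc, offsets)
assert offsets_to_biluo_tags(doc, offsets) == ["O", "O", "U-LOC", "O"]
```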
@@ -275,7 +275,7 @@ $ python -m spacy convert ./data.json ./output.spacy
 > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes
 > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token
 > representing a `PERSON` entity. The
-> [`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets) function
+> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function
 > can help you convert entity offsets to the right format.

 ```python
@@ -619,7 +619,7 @@ sequences in the batch.

 ## Training data and alignment {#gold source="spacy/training"}

-### training.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"}
+### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}

 Encode labelled spans into per-token tags, using the
 [BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit,
@@ -635,11 +635,11 @@ single-token entity.
 > #### Example
 >
 > ```python
-> from spacy.training import biluo_tags_from_offsets
+> from spacy.training import offsets_to_biluo_tags
 >
 > doc = nlp("I like London.")
 > entities = [(7, 13, "LOC")]
-> tags = biluo_tags_from_offsets(doc, entities)
+> tags = offsets_to_biluo_tags(doc, entities)
 > assert tags == ["O", "O", "U-LOC", "O"]
 > ```

@@ -649,7 +649,7 @@ single-token entity.
 | `entities` | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ |
 | **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |

-### training.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}
+### training.biluo_tags_to_offsets {#biluo_tags_to_offsets tag="function"}

 Encode per-token tags following the
 [BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets.
@@ -657,11 +657,11 @@ Encode per-token tags following the
 > #### Example
 >
 > ```python
-> from spacy.training import offsets_from_biluo_tags
+> from spacy.training import biluo_tags_to_offsets
 >
 > doc = nlp("I like London.")
 > tags = ["O", "O", "U-LOC", "O"]
-> entities = offsets_from_biluo_tags(doc, tags)
+> entities = biluo_tags_to_offsets(doc, tags)
 > assert entities == [(7, 13, "LOC")]
 > ```

@@ -671,7 +671,7 @@ Encode per-token tags following the
 | `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
 | **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ |

-### training.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"}
+### training.biluo_tags_to_spans {#biluo_tags_to_spans tag="function" new="2.1"}

 Encode per-token tags following the
 [BILUO scheme](/usage/linguistic-features#accessing-ner) into
@@ -681,11 +681,11 @@ token-based tags, e.g. to overwrite the `doc.ents`.
 > #### Example
 >
 > ```python
-> from spacy.training import spans_from_biluo_tags
+> from spacy.training import biluo_tags_to_spans
 >
 > doc = nlp("I like London.")
 > tags = ["O", "O", "U-LOC", "O"]
-> doc.ents = spans_from_biluo_tags(doc, tags)
+> doc.ents = biluo_tags_to_spans(doc, tags)
 > ```

 | Name | Description |
@@ -1501,7 +1501,7 @@ add those entities to the `doc.ents`, you can wrap it in a custom pipeline
 component function and pass it the token texts from the `Doc` object received by
 the component.

-The [`training.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) is very
+The [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans) is very
 helpful here, because it takes a `Doc` object and token-based BILUO tags and
 returns a sequence of `Span` objects in the `Doc` with added labels. So all your
 wrapper has to do is compute the entity spans and overwrite the `doc.ents`.
@@ -1516,14 +1516,14 @@ wrapper has to do is compute the entity spans and overwrite the `doc.ents`.
 ```python
 ### {highlight="1,8-9"}
 import your_custom_entity_recognizer
-from spacy.training import offsets_from_biluo_tags
+from spacy.training import biluo_tags_to_spans
 from spacy.language import Language

 @Language.component("custom_ner_wrapper")
 def custom_ner_wrapper(doc):
 words = [token.text for token in doc]
 custom_entities = your_custom_entity_recognizer(words)
-doc.ents = spans_from_biluo_tags(doc, custom_entities)
+doc.ents = biluo_tags_to_spans(doc, custom_entities)
 return doc
 ```

@@ -971,16 +971,17 @@ python -m spacy package ./output ./packages

 #### Data utilities and gold module {#migrating-gold}

-The `spacy.gold` module has been renamed to `spacy.training`. This mostly
+The `spacy.gold` module has been renamed to `spacy.training` and the conversion
+utilities now follow the naming format of `x_to_y`. This mostly
 affects internals, but if you've been using the span offset conversion utilities
-[`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets),
-[`offsets_from_biluo_tags`](/api/top-level#offsets_from_biluo_tags) or
-[`spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags), you'll have to
-change your imports:
+[`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags),
+[`biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets) or
+[`biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), you'll have to
+change your names and imports:

 ```diff
-- from spacy.gold import biluo_tags_from_offsets, spans_from_biluo_tags
-+ from spacy.training import biluo_tags_from_offsets, spans_from_biluo_tags
+- from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, spans_from_biluo_tags
++ from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, biluo_tags_to_spans
 ```

 #### Migration notes for plugin maintainers {#migrating-plugins}
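A migrated call site might then look like the sketch below (not part of this commit; built from the renamed helpers documented above):

```python
# Hypothetical post-migration usage of the renamed conversion helpers.
import spacy
from spacy.training import offsets_to_biluo_tags, biluo_tags_to_spans

nlp = spacy.blank("en")
doc = nlp("I like London.")

tags = offsets_to_biluo_tags(doc, [(7, 13, "LOC")])  # was biluo_tags_from_offsets
doc.ents = biluo_tags_to_spans(doc, tags)            # was spans_from_biluo_tags
print([(ent.text, ent.label_) for ent in doc.ents])  # [('London', 'LOC')]
```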