avoid empty aliases and improve UX and docs (#6840)

This commit is contained in:
Sofie Van Landeghem 2021-01-29 01:51:40 +01:00 committed by GitHub
parent 837a4f53c2
commit 24a697abb8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 35 additions and 2 deletions

View File

@ -470,6 +470,10 @@ class Errors:
"issue tracker: http://github.com/explosion/spaCy/issues") "issue tracker: http://github.com/explosion/spaCy/issues")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
E890 = ("Can not add the alias '{alias}' to the Knowledge base. "
"Each alias should be a meaningful string.")
E891 = ("Alias '{alias}' could not be added to the Knowledge base. "
"This is likely a bug in spaCy.")
E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}") E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
E893 = ("Could not find function '{name}' in function registry '{reg_name}'. " E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
"If you're using a custom function, make sure the code is available. " "If you're using a custom function, make sure the code is available. "

View File

@ -187,6 +187,10 @@ cdef class KnowledgeBase:
For a given alias, add its potential entities and prior probabilities to the KB. For a given alias, add its potential entities and prior probabilities to the KB.
Return the alias_hash at the end Return the alias_hash at the end
""" """
if alias is None or len(alias) == 0:
raise ValueError(Errors.E890.format(alias=alias))
previous_alias_nr = self.get_size_aliases()
# Throw an error if the length of entities and probabilities are not the same # Throw an error if the length of entities and probabilities are not the same
if not len(entities) == len(probabilities): if not len(entities) == len(probabilities):
raise ValueError(Errors.E132.format(alias=alias, raise ValueError(Errors.E132.format(alias=alias,
@ -220,6 +224,8 @@ cdef class KnowledgeBase:
new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs) new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
self._alias_index[alias_hash] = new_index self._alias_index[alias_hash] = new_index
if previous_alias_nr + 1 != self.get_size_aliases():
raise RuntimeError(Errors.E891.format(alias=alias))
return alias_hash return alias_hash
def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False): def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):

View File

@ -0,0 +1,23 @@
import pytest
from ..util import make_tempdir
def test_issue6730(en_vocab):
    """Check that an empty alias is rejected and that KB serialization still round-trips."""
    from spacy.kb import KnowledgeBase

    knowledge_base = KnowledgeBase(en_vocab, entity_vector_length=3)
    knowledge_base.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])

    # An empty alias must raise and must not leave a trace in the KB.
    with pytest.raises(ValueError):
        knowledge_base.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert knowledge_base.contains_alias("") is False

    # Valid aliases are still accepted after the rejected call.
    for alias_text, prior in (("x", 0.2), ("y", 0.1)):
        knowledge_base.add_alias(
            alias=alias_text, entities=["1"], probabilities=[prior]
        )

    # Write the KB to disk and read it back into the same object.
    with make_tempdir() as storage_dir:
        knowledge_base.to_disk(storage_dir)
        knowledge_base.from_disk(storage_dir)

    alias_strings = set(knowledge_base.get_alias_strings())
    assert knowledge_base.get_size_aliases() == 2
    assert alias_strings == {"x", "y"}

View File

@ -82,7 +82,7 @@ Add an alias or mention to the knowledge base, specifying its potential KB
identifiers and their prior probabilities. The entity identifiers should refer identifiers and their prior probabilities. The entity identifiers should refer
to entities previously added with [`add_entity`](/api/kb#add_entity) or to entities previously added with [`add_entity`](/api/kb#add_entity) or
[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities [`set_entities`](/api/kb#set_entities). The sum of the prior probabilities
should not exceed 1. should not exceed 1. Note that an empty string cannot be used as an alias.
> #### Example > #### Example
> >
@ -92,7 +92,7 @@ should not exceed 1.
| Name | Description | | Name | Description |
| --------------- | --------------------------------------------------------------------------------- | | --------------- | --------------------------------------------------------------------------------- |
| `alias` | The textual mention or alias. ~~str~~ | | `alias` | The textual mention or alias. Cannot be the empty string. ~~str~~ |
| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ | | `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ | | `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |