avoid empty aliases and improve UX and docs (#6840)

This commit is contained in:
Sofie Van Landeghem 2021-01-29 01:51:40 +01:00 committed by GitHub
parent 837a4f53c2
commit 24a697abb8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 35 additions and 2 deletions

View File

@ -470,6 +470,10 @@ class Errors:
"issue tracker: http://github.com/explosion/spaCy/issues")
# TODO: fix numbering after merging develop into master
E890 = ("Can not add the alias '{alias}' to the Knowledge base. "
"Each alias should be a meaningful string.")
E891 = ("Alias '{alias}' could not be added to the Knowledge base. "
"This is likely a bug in spaCy.")
E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
"If you're using a custom function, make sure the code is available. "

View File

@ -187,6 +187,10 @@ cdef class KnowledgeBase:
For a given alias, add its potential entities and prior probabilies to the KB.
Return the alias_hash at the end
"""
if alias is None or len(alias) == 0:
raise ValueError(Errors.E890.format(alias=alias))
previous_alias_nr = self.get_size_aliases()
# Throw an error if the length of entities and probabilities are not the same
if not len(entities) == len(probabilities):
raise ValueError(Errors.E132.format(alias=alias,
@ -220,6 +224,8 @@ cdef class KnowledgeBase:
new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
self._alias_index[alias_hash] = new_index
if previous_alias_nr + 1 != self.get_size_aliases():
raise RuntimeError(Errors.E891.format(alias=alias))
return alias_hash
def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):

View File

@ -0,0 +1,23 @@
import pytest
from ..util import make_tempdir
def test_issue6730(en_vocab):
"""Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
from spacy.kb import KnowledgeBase
kb = KnowledgeBase(en_vocab, entity_vector_length=3)
kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
with pytest.raises(ValueError):
kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
assert kb.contains_alias("") is False
kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])
with make_tempdir() as tmp_dir:
kb.to_disk(tmp_dir)
kb.from_disk(tmp_dir)
assert kb.get_size_aliases() == 2
assert set(kb.get_alias_strings()) == {"x", "y"}

View File

@ -82,7 +82,7 @@ Add an alias or mention to the knowledge base, specifying its potential KB
identifiers and their prior probabilities. The entity identifiers should refer
to entities previously added with [`add_entity`](/api/kb#add_entity) or
[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities
should not exceed 1.
should not exceed 1. Note that an empty string can not be used as alias.
> #### Example
>
@ -92,7 +92,7 @@ should not exceed 1.
| Name | Description |
| --------------- | --------------------------------------------------------------------------------- |
| `alias` | The textual mention or alias. ~~str~~ |
| `alias` | The textual mention or alias. Can not be the empty string. ~~str~~ |
| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |