diff --git a/spacy/errors.py b/spacy/errors.py index e6912a263..ff1185361 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -877,6 +877,7 @@ class Errors: "filename. Specify an epoch to resume from.") E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. " "Non-UD tags should use the `tag` property.") + E1022 = ("Words must be of type str or int, but input is of type '{wtype}'") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 5ea3e1e3b..1ee845934 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -194,11 +194,12 @@ cdef class Doc: vocab (Vocab): A vocabulary object, which must match any models you want to use (e.g. tokenizer, parser, entity recognizer). - words (Optional[List[str]]): A list of unicode strings to add to the document - as words. If `None`, defaults to empty list. - spaces (Optional[List[bool]]): A list of boolean values, of the same length as - words. True means that the word is followed by a space, False means - it is not. If `None`, defaults to `[True]*len(words)` + words (Optional[List[Union[str, int]]]): A list of unicode strings or + hash values to add to the document as words. If `None`, defaults to + empty list. + spaces (Optional[List[bool]]): A list of boolean values, of the same + length as `words`. `True` means that the word is followed by a space, + `False` means it is not. If `None`, defaults to `[True]*len(words)` user_data (dict or None): Optional extra data to attach to the Doc. tags (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.tag. Defaults to None. @@ -266,7 +267,10 @@ cdef class Doc: elif isinstance(word, bytes): raise ValueError(Errors.E028.format(value=word)) else: - lexeme = self.vocab.get_by_orth(self.mem, word) + try: + lexeme = self.vocab.get_by_orth(self.mem, word) + except TypeError: + raise TypeError(Errors.E1022.format(wtype=type(word))) self.push_back(lexeme, has_space) if heads is not None: diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index e1f18963b..9836b8c21 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -34,7 +34,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the | Name | Description | | ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `vocab` | A storage container for lexical types. ~~Vocab~~ | -| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | +| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ | | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | | _keyword-only_ | | | `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |