Clarify error when words are of wrong type (#9541)

* Clarify error when words are of wrong type

See #9437

* Update docs

* Use try/except

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2025-07-17 11:42:30 +03:00 · 2021-10-29 10:08:40 +00:00 · 2021-10-29 10:08:40 +00:00 · 006df1ae1f
commit 006df1ae1f
parent 2fd8d616e7
3 changed files with 12 additions and 7 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -877,6 +877,7 @@ class Errors:
             "filename. Specify an epoch to resume from.")
    E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
             "Non-UD tags should use the `tag` property.")
+    E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
 # Deprecated model shortcuts, only used in errors and warnings
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -194,11 +194,12 @@ cdef class Doc:
        vocab (Vocab): A vocabulary object, which must match any models you
            want to use (e.g. tokenizer, parser, entity recognizer).
-        words (Optional[List[str]]): A list of unicode strings to add to the document
-            as words. If `None`, defaults to empty list.
-        spaces (Optional[List[bool]]): A list of boolean values, of the same length as
-            words. True means that the word is followed by a space, False means
-            it is not. If `None`, defaults to `[True]*len(words)`
+        words (Optional[List[Union[str, int]]]): A list of unicode strings or
+            hash values to add to the document as words. If `None`, defaults to
+            empty list.
+        spaces (Optional[List[bool]]): A list of boolean values, of the same
+            length as `words`. `True` means that the word is followed by a space,
+            `False` means it is not. If `None`, defaults to `[True]*len(words)`
        user_data (dict or None): Optional extra data to attach to the Doc.
        tags (Optional[List[str]]): A list of unicode strings, of the same
            length as words, to assign as token.tag. Defaults to None.
@@ -266,7 +267,10 @@ cdef class Doc:
            elif isinstance(word, bytes):
                raise ValueError(Errors.E028.format(value=word))
            else:
-                lexeme = self.vocab.get_by_orth(self.mem, word)
+                try:
+                    lexeme = self.vocab.get_by_orth(self.mem, word)
+                except TypeError:
+                    raise TypeError(Errors.E1022.format(wtype=type(word)))
            self.push_back(lexeme, has_space)
        if heads is not None:
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -34,7 +34,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | Name                                     | Description                                                                                                                                                                                        |
 | ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab`                                  | A storage container for lexical types. ~~Vocab~~                                                                                                                                                   |
-| `words`                                  | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                                 |
+| `words`                                  | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str, int]]]~~                                                                                                |
 | `spaces`                                 | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~       |
 | _keyword-only_                           |                                                                                                                                                                                                    |
 | `user\_data`                             | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                                 |