diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 4069e018a..86c7fbf72 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -272,3 +272,9 @@ def test_doc_is_nered(en_vocab): # Test serialization new_doc = Doc(en_vocab).from_bytes(doc.to_bytes()) assert new_doc.is_nered + + +def test_doc_lang(en_vocab): + doc = Doc(en_vocab, words=["Hello", "world"]) + assert doc.lang_ == "en" + assert doc.lang == en_vocab.strings["en"] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4d3ed084a..857c7b538 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -597,6 +597,16 @@ cdef class Doc: if start != self.length: yield Span(self, start, self.length) + @property + def lang(self): + """RETURNS (uint64): ID of the doc's vocabulary language, i.e. `vocab.lang` looked up in `vocab.strings`.""" + return self.vocab.strings[self.vocab.lang] + + @property + def lang_(self): + """RETURNS (unicode): Language of the doc's vocabulary (`vocab.lang`), e.g. 'en'.""" + return self.vocab.lang + cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: if self.length == 0: # Flip these to false when we see the first token. @@ -748,7 +758,7 @@ cdef class Doc: # Allow strings, e.g. 'lemma' or 'LEMMA' attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) for id_ in attrs] - + if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) cdef int i, col diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 953a31c2d..f5a94335f 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -654,6 +654,8 @@ The L2 norm of the document's vector representation. | `tensor` 2 | object | Container for dense vector representations. | | `cats` 2 | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. 
`start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. | | `user_data` | - | A generic storage area, for user custom data. | +| `lang` 2.1 | int | ID of the language of the document's vocabulary. | +| `lang_` 2.1 | unicode | Language of the document's vocabulary, e.g. `en`. | | `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. | | `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. | | `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. |