Add Doc.lang and Doc.lang_

2026-03-01 10:21:28 +03:00 · 2019-03-11 14:21:40 +01:00 · 2019-03-11 14:21:40 +01:00 · ebcf2bb1c3
commit ebcf2bb1c3
parent ef80cfde6f
3 changed files with 19 additions and 1 deletions
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -272,3 +272,9 @@ def test_doc_is_nered(en_vocab):
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.is_nered
+
+
+def test_doc_lang(en_vocab):
+    doc = Doc(en_vocab, words=["Hello", "world"])
+    assert doc.lang_ == "en"  # string form of the doc's language
+    assert doc.lang == en_vocab.strings["en"]  # ID form equals the vocab.strings entry for "en"
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -597,6 +597,16 @@ cdef class Doc:
                if start != self.length:
                    yield Span(self, start, self.length)

+    @property
+    def lang(self):
+        """RETURNS (uint64): ID of the language of the doc's vocabulary."""
+        return self.vocab.strings[self.vocab.lang]  # language string -> ID via vocab.strings
+
+    @property
+    def lang_(self):
+        """RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'."""
+        return self.vocab.lang  # delegates to the vocab's `lang` attribute
+
    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
        if self.length == 0:
            # Flip these to false when we see the first token.
@@ -748,7 +758,7 @@ cdef class Doc:
        # Allow strings, e.g. 'lemma' or 'LEMMA'
        attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
                 for id_ in attrs]
- 
+
        if SENT_START in attrs and HEAD in attrs:
            raise ValueError(Errors.E032)
        cdef int i, col
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -654,6 +654,8 @@ The L2 norm of the document's vector representation.
 | `tensor` <Tag variant="new">2</Tag>     | object       | Container for dense vector representations.                                                                                                                                                                                                                                                |
 | `cats` <Tag variant="new">2</Tag>       | dictionary   | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. |
 | `user_data`                             | -            | A generic storage area, for user custom data.                                                                                                                                                                                                                                              |
+| `lang` <Tag variant="new">2.1</Tag>     | int          | Language of the document's vocabulary.                                                                                                                                                                                                                                                     |
+| `lang_` <Tag variant="new">2.1</Tag>    | unicode      | Language of the document's vocabulary.                                                                                                                                                                                                                                                     |
 | `is_tagged`                             | bool         | A flag indicating that the document has been part-of-speech tagged.                                                                                                                                                                                                                        |
 | `is_parsed`                             | bool         | A flag indicating that the document has been syntactically parsed.                                                                                                                                                                                                                         |
 | `is_sentenced`                          | bool         | A flag indicating that sentence boundaries have been applied to the document.                                                                                                                                                                                                              |