Add Doc.lang and Doc.lang_

This commit is contained in:
Ines Montani 2019-03-11 14:21:40 +01:00
parent ef80cfde6f
commit ebcf2bb1c3
3 changed files with 19 additions and 1 deletions

View File

@ -272,3 +272,9 @@ def test_doc_is_nered(en_vocab):
# Test serialization
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
assert new_doc.is_nered
def test_doc_lang(en_vocab):
doc = Doc(en_vocab, words=["Hello", "world"])
assert doc.lang_ == "en"
assert doc.lang == en_vocab.strings["en"]

View File

@ -597,6 +597,16 @@ cdef class Doc:
if start != self.length:
yield Span(self, start, self.length)
@property
def lang(self):
"""RETURNS (uint64): ID of the language of the doc's vocabulary."""
return self.vocab.strings[self.vocab.lang]
@property
def lang_(self):
"""RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'."""
return self.vocab.lang
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
if self.length == 0:
# Flip these to false when we see the first token.
@ -748,7 +758,7 @@ cdef class Doc:
# Allow strings, e.g. 'lemma' or 'LEMMA'
attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in attrs]
if SENT_START in attrs and HEAD in attrs:
raise ValueError(Errors.E032)
cdef int i, col

View File

@ -654,6 +654,8 @@ The L2 norm of the document's vector representation.
| `tensor` <Tag variant="new">2</Tag> | object | Container for dense vector representations. |
| `cats` <Tag variant="new">2</Tag> | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. |
| `user_data` | - | A generic storage area, for user custom data. |
| `lang` <Tag variant="new">2.1</Tag> | int | Language of the document's vocabulary. |
| `lang_` <Tag variant="new">2.1</Tag> | unicode | Language of the document's vocabulary. |
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. |
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. |
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. |