mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Add Doc.lang and Doc.lang_
This commit is contained in:
parent
ef80cfde6f
commit
ebcf2bb1c3
|
@ -272,3 +272,9 @@ def test_doc_is_nered(en_vocab):
|
|||
# Test serialization
|
||||
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
|
||||
assert new_doc.is_nered
|
||||
|
||||
|
||||
def test_doc_lang(en_vocab):
    """Check that a Doc exposes its vocab's language as both string and ID."""
    words = ["Hello", "world"]
    doc = Doc(en_vocab, words=words)
    # String form of the language code.
    assert doc.lang_ == "en"
    # Integer form must agree with the vocab's string-store entry for "en".
    expected_id = en_vocab.strings["en"]
    assert doc.lang == expected_id
|
||||
|
|
|
@ -597,6 +597,16 @@ cdef class Doc:
|
|||
if start != self.length:
|
||||
yield Span(self, start, self.length)
|
||||
|
||||
@property
def lang(self):
    """RETURNS (uint64): ID of the doc vocabulary's language, as looked up
    in the vocab's string store.
    """
    vocab = self.vocab
    lang_code = vocab.lang
    return vocab.strings[lang_code]
|
||||
|
||||
@property
def lang_(self):
    """RETURNS (unicode): Language code of the doc's vocabulary,
    e.g. 'en'.
    """
    vocab = self.vocab
    return vocab.lang
|
||||
|
||||
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
|
||||
if self.length == 0:
|
||||
# Flip these to false when we see the first token.
|
||||
|
@ -748,7 +758,7 @@ cdef class Doc:
|
|||
# Allow strings, e.g. 'lemma' or 'LEMMA'
|
||||
attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
|
||||
for id_ in attrs]
|
||||
|
||||
|
||||
if SENT_START in attrs and HEAD in attrs:
|
||||
raise ValueError(Errors.E032)
|
||||
cdef int i, col
|
||||
|
|
|
@ -654,6 +654,8 @@ The L2 norm of the document's vector representation.
|
|||
| `tensor` <Tag variant="new">2</Tag> | object | Container for dense vector representations. |
|
||||
| `cats` <Tag variant="new">2</Tag> | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. |
|
||||
| `user_data` | - | A generic storage area, for user custom data. |
|
||||
| `lang` <Tag variant="new">2.1</Tag> | int | ID of the language of the document's vocabulary. |
|
||||
| `lang_` <Tag variant="new">2.1</Tag> | unicode | Language of the document's vocabulary, e.g. `'en'`. |
|
||||
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. |
|
||||
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. |
|
||||
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. |
|
||||
|
|
Loading…
Reference in New Issue
Block a user