Merge pull request #306 from wbwseeker/german_noun_chunks

add German noun chunk functionality
Matthew Honnibal 2016-04-08 00:54:24 +10:00
commit 872695759d
15 changed files with 228 additions and 162 deletions

View File

@@ -63,7 +63,8 @@ MOD_NAMES = [
     'spacy.cfile',
     'spacy.matcher',
     'spacy.syntax.ner',
-    'spacy.symbols']
+    'spacy.symbols',
+    'spacy.syntax.iterators']

 # By subclassing build_extensions we have the actual compiler that will be used
@@ -213,3 +214,4 @@ def setup_package():

 if __name__ == '__main__':
     setup_package()

View File

@@ -14,12 +14,12 @@ cpdef enum attr_id_t:
     LIKE_EMAIL
     IS_STOP
     IS_OOV
-    FLAG14 = 14
-    FLAG15
-    FLAG16
-    FLAG17
-    FLAG18
+    IS_BRACKET
+    IS_QUOTE
+    IS_LEFT_PUNCT
+    IS_RIGHT_PUNCT
+    FLAG18 = 18
     FLAG19
     FLAG20
     FLAG21
@@ -86,10 +86,6 @@ cpdef enum attr_id_t:
     SPACY
     PROB
-    # Move these up to FLAG14--FLAG18 once we finish the functionality and
-    # are ready to regenerate the model
-    #IS_BRACKET
-    #IS_QUOTE
-    #IS_LEFT_PUNCT
-    #IS_RIGHT_PUNCT
+    LANG

View File

@@ -13,10 +13,10 @@ IDS = {
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
     "IS_OOV": IS_OOV,
-    "FLAG14": FLAG14,
-    "FLAG15": FLAG15,
-    "FLAG16": FLAG16,
-    "FLAG17": FLAG17,
+    "IS_BRACKET": IS_BRACKET,
+    "IS_QUOTE": IS_QUOTE,
+    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
+    "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
     "FLAG18": FLAG18,
     "FLAG19": FLAG19,
     "FLAG20": FLAG20,
@@ -83,6 +83,7 @@ IDS = {
     "HEAD": HEAD,
     "SPACY": SPACY,
     "PROB": PROB,
+    "LANG": LANG,
 }

 # ATTR IDs, in order of the symbol

View File

@@ -34,10 +34,6 @@ class Language(object):
     def norm(string):
         return string

-    @staticmethod
-    def shape(string):
-        return orth.word_shape(string)
-
     @staticmethod
     def prefix(string):
         return string[0]
@@ -50,66 +46,14 @@ class Language(object):
     def cluster(string):
         return 0

-    @staticmethod
-    def is_alpha(string):
-        return orth.is_alpha(string)
-
-    @staticmethod
-    def is_ascii(string):
-        return orth.is_ascii(string)
-
     @staticmethod
     def is_digit(string):
         return string.isdigit()

-    @staticmethod
-    def is_lower(string):
-        return orth.is_lower(string)
-
-    @staticmethod
-    def is_punct(string):
-        return orth.is_punct(string)
-
     @staticmethod
     def is_space(string):
         return string.isspace()

-    @staticmethod
-    def is_title(string):
-        return orth.is_title(string)
-
-    @staticmethod
-    def is_bracket(string):
-        return orth.is_bracket(string)
-
-    @staticmethod
-    def is_quote(string):
-        return orth.is_quote(string)
-
-    @staticmethod
-    def is_left_punct(string):
-        return orth.is_left_punct(string)
-
-    @staticmethod
-    def is_right_punct(string):
-        return orth.is_right_punct(string)
-
-    @staticmethod
-    def is_upper(string):
-        return orth.is_upper(string)
-
-    @staticmethod
-    def like_url(string):
-        return orth.like_url(string)
-
-    @staticmethod
-    def like_num(string):
-        return orth.like_number(string)
-
-    @staticmethod
-    def like_email(string):
-        return orth.like_email(string)
-
     @staticmethod
     def is_stop(string):
         return 0
@@ -120,26 +64,27 @@ class Language(object):
         return {
             attrs.LOWER: cls.lower,
             attrs.NORM: cls.norm,
-            attrs.SHAPE: cls.shape,
+            attrs.SHAPE: orth.word_shape,
             attrs.PREFIX: cls.prefix,
             attrs.SUFFIX: cls.suffix,
             attrs.CLUSTER: cls.cluster,
             attrs.PROB: lambda string: oov_prob,
-            attrs.IS_ALPHA: cls.is_alpha,
-            attrs.IS_ASCII: cls.is_ascii,
+            attrs.LANG: lambda string: cls.lang,
+            attrs.IS_ALPHA: orth.is_alpha,
+            attrs.IS_ASCII: orth.is_ascii,
             attrs.IS_DIGIT: cls.is_digit,
-            attrs.IS_LOWER: cls.is_lower,
-            attrs.IS_PUNCT: cls.is_punct,
+            attrs.IS_LOWER: orth.is_lower,
+            attrs.IS_PUNCT: orth.is_punct,
             attrs.IS_SPACE: cls.is_space,
-            attrs.IS_TITLE: cls.is_title,
-            attrs.IS_UPPER: cls.is_upper,
-            attrs.FLAG14: cls.is_bracket,
-            attrs.FLAG15: cls.is_quote,
-            attrs.FLAG16: cls.is_left_punct,
-            attrs.FLAG17: cls.is_right_punct,
-            attrs.LIKE_URL: cls.like_url,
-            attrs.LIKE_NUM: cls.like_num,
-            attrs.LIKE_EMAIL: cls.like_email,
+            attrs.IS_TITLE: orth.is_title,
+            attrs.IS_UPPER: orth.is_upper,
+            attrs.IS_BRACKET: orth.is_bracket,
+            attrs.IS_QUOTE: orth.is_quote,
+            attrs.IS_LEFT_PUNCT: orth.is_left_punct,
+            attrs.IS_RIGHT_PUNCT: orth.is_right_punct,
+            attrs.LIKE_URL: orth.like_url,
+            attrs.LIKE_NUM: orth.like_number,
+            attrs.LIKE_EMAIL: orth.like_email,
             attrs.IS_STOP: cls.is_stop,
             attrs.IS_OOV: lambda string: True
         }

View File

@@ -1,6 +1,6 @@
 from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
 from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
+from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG
 from .structs cimport LexemeC
 from .strings cimport StringStore
@@ -41,6 +41,8 @@ cdef class Lexeme:
             lex.suffix = value
         elif name == CLUSTER:
             lex.cluster = value
+        elif name == LANG:
+            lex.lang = value

     @staticmethod
     cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
@@ -67,6 +69,8 @@ cdef class Lexeme:
             return lex.length
         elif feat_name == CLUSTER:
             return lex.cluster
+        elif feat_name == LANG:
+            return lex.lang
         else:
             return 0

View File

@@ -18,10 +18,10 @@ import numpy
 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from .attrs cimport FLAG14 as IS_BRACKET
-from .attrs cimport FLAG15 as IS_QUOTE
-from .attrs cimport FLAG16 as IS_LEFT_PUNCT
-from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
+from .attrs cimport IS_BRACKET
+from .attrs cimport IS_QUOTE
+from .attrs cimport IS_LEFT_PUNCT
+from .attrs cimport IS_RIGHT_PUNCT
 from .attrs cimport IS_OOV
@@ -74,8 +74,8 @@ cdef class Lexeme:
             raise ValueError(
                 "Word vectors set to length 0. This may be because the "
                 "data is not installed. If you haven't already, run"
-                "\npython -m spacy.en.download all\n"
-                "to install the data."
+                "\npython -m spacy.%s.download all\n"
+                "to install the data." % self.vocab.lang
             )
         vector_view = <float[:length,]>self.c.vector
@@ -123,6 +123,10 @@ cdef class Lexeme:
         def __get__(self): return self.c.cluster
         def __set__(self, int x): self.c.cluster = x

+    property lang:
+        def __get__(self): return self.c.lang
+        def __set__(self, int x): self.c.lang = x
+
     property prob:
         def __get__(self): return self.c.prob
         def __set__(self, float x): self.c.prob = x
@@ -147,6 +151,10 @@ cdef class Lexeme:
         def __get__(self): return self.vocab.strings[self.c.suffix]
         def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]

+    property lang_:
+        def __get__(self): return self.vocab.strings[self.c.lang]
+        def __set__(self, unicode x): self.c.lang = self.vocab.strings[x]
+
     property flags:
         def __get__(self): return self.c.flags
         def __set__(self, flags_t x): self.c.flags = x

View File

@@ -5,9 +5,6 @@ import unicodedata
 import re

-TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split()
-
-
 # Binary string features
 cpdef bint is_alpha(unicode string):
     return string.isalpha()
@@ -36,20 +33,25 @@ cpdef bint is_ascii(unicode string):
     else:
         return True

 cpdef bint is_bracket(unicode string):
-    return False
+    brackets = ('(',')','[',']','{','}','<','>')
+    return string in brackets

 cpdef bint is_quote(unicode string):
-    if string in ('"', "'"):
-        return True
-    else:
-        return False
+    quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','「','」',"''",'``')
+    return string in quotes

 cpdef bint is_left_punct(unicode string):
-    return False
+    left_punct = ('(','[','{','<','"',"'",'«','‘','‚','„','‟','‛','“','‹','「','``')
+    return string in left_punct

 cpdef bint is_right_punct(unicode string):
-    return False
+    right_punct = (')',']','}','>','"',"'",'»','’','”','›','」',"''")
+    return string in right_punct

 cpdef bint is_title(unicode string):
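
In behavioral terms: is_bracket, is_left_punct, and is_right_punct previously returned False unconditionally, and is_quote only recognized straight quotes; all four are now membership tests over small character sets. A plain-Python rendering of the new predicate style (character sets abbreviated; the non-ASCII members of the tuples above are reconstructed from context and may not match the commit byte-for-byte):

    # Plain-Python sketch of the new predicate style.
    brackets = ('(', ')', '[', ']', '{', '}', '<', '>')

    def is_bracket(string):
        return string in brackets

    assert is_bracket('{')        # True now; the old version always returned False
    assert not is_bracket('-')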

View File

@@ -9,6 +9,8 @@ cdef struct LexemeC:
     flags_t flags

+    attr_t lang
+
     attr_t id
     attr_t length

View File

@@ -0,0 +1,16 @@
+from spacy.tokens.doc cimport Doc
+
+cdef class DocIterator:
+    cdef Doc _doc
+
+cdef class EnglishNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _conjunct
+
+cdef class GermanNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _close_app

View File

@@ -0,0 +1,82 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+from spacy.tokens.doc cimport Doc
+from spacy.tokens.token cimport Token
+
+from spacy.parts_of_speech cimport NOUN
+
+# base class for document iterators
+cdef class DocIterator:
+    def __init__(self, Doc doc):
+        self._doc = doc
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        raise NotImplementedError
+
+
+cdef class EnglishNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(EnglishNounChunks, self).__init__(doc)
+        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set(self._doc.vocab.strings[label] for label in labels)
+        self._conjunct = self._doc.vocab.strings['conj']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef int widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN:
+                if word.dep in self._np_deps:
+                    return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+                elif word.dep == self._conjunct:
+                    head = word + word.head
+                    while head.dep == self._conjunct and head.head < 0:
+                        head += head.head
+                    # If the head is an NP, and we're coordinated to it, we're an NP
+                    if head.dep in self._np_deps:
+                        return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+        raise StopIteration
+
+
+# This iterator extracts spans headed by NOUNs, starting from the left-most
+# syntactic dependent and running up to the NOUN itself. For close apposition
+# and measurement constructions, the span is sometimes extended to the right
+# of the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse
+# Tee" and not just "eine Tasse", same for "das Thema Familie".
+cdef class GermanNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(GermanNounChunks, self).__init__(doc)
+        labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set(self._doc.vocab.strings[label] for label in labels)
+        self._close_app = self._doc.vocab.strings['nk']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef int rbracket
+        cdef Token rdep
+        cdef int widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN and word.dep in self._np_deps:
+                rbracket = widx+1
+                # try to extend the span to the right
+                # to capture close apposition/measurement constructions
+                for rdep in self._doc[widx].rights:
+                    if rdep.pos == NOUN and rdep.dep == self._close_app:
+                        rbracket = rdep.i+1
+                return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
+        raise StopIteration
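
To make the new iterators concrete, here is a hedged usage sketch. It assumes a German model is installed and exposed as spacy.de.German (that loader name is an assumption by analogy with spacy.en.English; this diff only adds the iterators):

    # Hedged usage sketch: assumes an installed German model and that the
    # loader is exposed as spacy.de.German (by analogy with spacy.en.English).
    from spacy.de import German

    nlp = German()
    doc = nlp(u'Ich habe eine Tasse Tee getrunken.')
    for np in doc.noun_chunks:
        # Parser._finalize (see parser.pyx below) attached GermanNounChunks,
        # so the chunk around "Tasse" extends over the close apposition:
        # u'eine Tasse Tee', not just u'eine Tasse'.
        print(np.text)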

View File

@@ -47,6 +47,8 @@ from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC

+from spacy.syntax.iterators cimport DocIterator, EnglishNounChunks, GermanNounChunks
+CHUNKERS = {'en':EnglishNounChunks, 'de':GermanNounChunks}
+
 DEBUG = False
@@ -113,12 +115,9 @@ cdef class Parser:
         cdef int nr_feat = self.model.nr_feat
         with nogil:
             self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
-        tokens.is_parsed = True
         # Check for KeyboardInterrupt etc. Untested
         PyErr_CheckSignals()
-        # projectivize output
-        if self._projectivize:
-            PseudoProjectivity.deprojectivize(tokens)
+        self._finalize(tokens)

     def pipe(self, stream, int batch_size=1000, int n_threads=2):
         cdef Pool mem = Pool()
@@ -144,7 +143,7 @@ cdef class Parser:
                     raise ValueError("Error parsing doc: %s" % sent_str)
                 PyErr_CheckSignals()
                 for doc in queue:
-                    doc.is_parsed = True
+                    self._finalize(doc)
                     yield doc
                 queue = []
         batch_size = len(queue)
@@ -155,10 +154,19 @@ cdef class Parser:
                 with gil:
                     sent_str = queue[i].text
                 raise ValueError("Error parsing doc: %s" % sent_str)
-        for doc in queue:
-            doc.is_parsed = True
-            yield doc
         PyErr_CheckSignals()
+        for doc in queue:
+            self._finalize(doc)
+            yield doc
+
+    def _finalize(self, Doc doc):
+        # deprojectivize output
+        if self._projectivize:
+            PseudoProjectivity.deprojectivize(doc)
+        # set annotation-specific iterators
+        doc.noun_chunks = CHUNKERS.get(doc.vocab.lang,DocIterator)
+        # mark doc as parsed
+        doc.is_parsed = True

     cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
         cdef ExampleC eg
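
A note on the dispatch in _finalize: CHUNKERS.get(doc.vocab.lang, DocIterator) falls back to the base class for languages without a registered chunker, and DocIterator.__next__ raises NotImplementedError, so iterating noun_chunks for an unsupported language fails loudly instead of silently yielding nothing. A minimal pure-Python sketch of that behavior:

    # Minimal sketch of the fallback path (pure Python; the class body
    # mirrors the DocIterator defined in iterators.pyx above).
    class DocIterator(object):
        def __init__(self, doc):
            self._doc = doc
        def __iter__(self):
            return self
        def __next__(self):
            raise NotImplementedError
        next = __next__          # Python 2 compatibility

    CHUNKERS = {}                # imagine {'en': ..., 'de': ...} as in the diff
    chunker_cls = CHUNKERS.get('fr', DocIterator)   # no French chunker here
    try:
        next(iter(chunker_cls(doc=None)))
    except NotImplementedError:
        print("noun chunks not implemented for this language")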

View File

@@ -7,6 +7,8 @@ from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t

+from spacy.syntax.iterators cimport DocIterator
+

 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
@@ -42,6 +44,8 @@ cdef class Doc:
     cdef int length
     cdef int max_length

+    cdef DocIterator noun_chunks_iterator
+
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1

     cpdef np.ndarray to_array(self, object features)

View File

@@ -8,6 +8,7 @@ import struct
 cimport numpy as np
 import math
 import six
+import warnings

 from ..lexeme cimport Lexeme
 from ..lexeme cimport EMPTY_LEXEME
@@ -80,6 +81,7 @@ cdef class Doc:
         self.is_parsed = False
         self._py_tokens = []
         self._vector = None
+        self.noun_chunks_iterator = DocIterator(self)

     def __getitem__(self, object i):
         """Get a Token or a Span from the Doc.
@@ -230,33 +232,22 @@ cdef class Doc:
         # Set start as B
         self.c[start].ent_iob = 3

-    @property
-    def noun_chunks(self):
-        """Yield spans for base noun phrases."""
-        if not self.is_parsed:
-            raise ValueError(
-                "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.en.download all\n"
-                "to install the data")
-        cdef const TokenC* word
-        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
-                  'attr', 'root']
-        np_deps = [self.vocab.strings[label] for label in labels]
-        conj = self.vocab.strings['conj']
-        np_label = self.vocab.strings['NP']
-        for i in range(self.length):
-            word = &self.c[i]
-            if word.pos == NOUN and word.dep in np_deps:
-                yield Span(self, word.l_edge, i+1, label=np_label)
-            elif word.pos == NOUN and word.dep == conj:
-                head = word+word.head
-                while head.dep == conj and head.head < 0:
-                    head += head.head
-                # If the head is an NP, and we're coordinated to it, we're an NP
-                if head.dep in np_deps:
-                    yield Span(self, word.l_edge, i+1, label=np_label)
+    property noun_chunks:
+        def __get__(self):
+            """Yield spans for base noun phrases."""
+            if not self.is_parsed:
+                raise ValueError(
+                    "noun_chunks requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.%s.download all\n"
+                    "to install the data" % self.vocab.lang)
+            yield from self.noun_chunks_iterator
+
+        def __set__(self, DocIterator):
+            self.noun_chunks_iterator = DocIterator(self)

     @property
     def sents(self):
@@ -267,8 +258,8 @@ cdef class Doc:
             raise ValueError(
                 "sentence boundary detection requires the dependency parse, which "
                 "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.en.download all\n"
-                "to install the data")
+                "\npython -m spacy.%s.download all\n"
+                "to install the data" % self.vocab.lang)
         cdef int i
         start = 0
         for i in range(1, self.length):
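
One subtlety in the rewritten property: the __set__ parameter named DocIterator is really the chunker class being assigned (the name shadows the cimported type), and the setter binds it to the doc by instantiating it. A rough pure-Python equivalent of the getter/setter pair:

    # Rough pure-Python equivalent of the noun_chunks property (illustrative
    # only; the real code is the Cython above).
    class DocIterator(object):
        def __init__(self, doc):
            self._doc = doc
        def __iter__(self):
            return self
        def __next__(self):
            raise NotImplementedError

    class Doc(object):
        def __init__(self):
            self.is_parsed = False
            self.noun_chunks_iterator = DocIterator(self)

        @property
        def noun_chunks(self):
            # mirrors __get__: refuse to iterate an unparsed doc
            if not self.is_parsed:
                raise ValueError("noun_chunks requires the dependency parse")
            for span in self.noun_chunks_iterator:
                yield span

        @noun_chunks.setter
        def noun_chunks(self, chunker_cls):
            # Parser._finalize passes a class (e.g. GermanNounChunks); the
            # setter binds it to this doc by instantiating it, as __set__ does.
            self.noun_chunks_iterator = chunker_cls(self)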

View File

@@ -18,10 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CONJ, PUNCT
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from ..attrs cimport FLAG14 as IS_BRACKET
-from ..attrs cimport FLAG15 as IS_QUOTE
-from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
-from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
+from ..attrs cimport IS_BRACKET
+from ..attrs cimport IS_QUOTE
+from ..attrs cimport IS_LEFT_PUNCT
+from ..attrs cimport IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV
@@ -95,6 +95,10 @@ cdef class Token:
         def __get__(self):
             return self.c.lex.prob

+    property lang:
+        def __get__(self):
+            return self.c.lex.lang
+
     property idx:
         def __get__(self):
             return self.c.idx
@@ -161,8 +165,8 @@ cdef class Token:
             raise ValueError(
                 "Word vectors set to length 0. This may be because the "
                 "data is not installed. If you haven't already, run"
-                "\npython -m spacy.en.download all\n"
-                "to install the data."
+                "\npython -m spacy.%s.download all\n"
+                "to install the data." % self.vocab.lang
             )
         vector_view = <float[:length,]>self.c.lex.vector
         return numpy.asarray(vector_view)
@@ -177,23 +181,11 @@ cdef class Token:
     property n_lefts:
         def __get__(self):
-            cdef int n = 0
-            cdef const TokenC* ptr = self.c - self.i
-            while ptr != self.c:
-                if ptr + ptr.head == self.c:
-                    n += 1
-                ptr += 1
-            return n
+            return self.c.l_kids

     property n_rights:
         def __get__(self):
-            cdef int n = 0
-            cdef const TokenC* ptr = self.c + (self.array_len - self.i)
-            while ptr != self.c:
-                if ptr + ptr.head == self.c:
-                    n += 1
-                ptr -= 1
-            return n
+            return self.c.r_kids

     property lefts:
         def __get__(self):
@@ -415,6 +407,10 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.c.lex.suffix]

+    property lang_:
+        def __get__(self):
+            return self.vocab.strings[self.c.lex.lang]
+
     property lemma_:
         def __get__(self):
             return self.vocab.strings[self.c.lemma]
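
A hedged usage sketch for the new lang/lang_ token attributes (assumes the English data is installed; spacy.en.English was the loader of the era):

    # Hedged usage sketch for the new lang/lang_ attributes.
    from spacy.en import English

    nlp = English()
    doc = nlp(u'This is a sentence.')
    token = doc[0]
    print(token.lang_)   # -> u'en', resolved through vocab.strings
    print(token.lang)    # -> the integer id interned for u'en'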

View File

@@ -26,7 +26,7 @@ from . import symbols
 from cymem.cymem cimport Address
 from .serialize.packer cimport Packer
-from .attrs cimport PROB
+from .attrs cimport PROB, LANG

 try:
     import copy_reg
@@ -104,6 +104,13 @@ cdef class Vocab:
             self._serializer = Packer(self, self.serializer_freqs)
         return self._serializer

+    property lang:
+        def __get__(self):
+            langfunc = None
+            if self.get_lex_attr:
+                langfunc = self.get_lex_attr.get(LANG,None)
+            return langfunc('_') if langfunc else ''
+
     def __len__(self):
         """The current number of lexemes stored."""
         return self.length
@@ -245,6 +252,7 @@ cdef class Vocab:
             fp.write_from(&lexeme.prob, sizeof(lexeme.prob), 1)
             fp.write_from(&lexeme.sentiment, sizeof(lexeme.sentiment), 1)
             fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
+            fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1)
         fp.close()

     def load_lexemes(self, loc):
def load_lexemes(self, loc): def load_lexemes(self, loc):
@ -277,6 +285,7 @@ cdef class Vocab:
fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob)) fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment)) fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm)) fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang))
lexeme.vector = EMPTY_VEC lexeme.vector = EMPTY_VEC
py_str = self.strings[lexeme.orth] py_str = self.strings[lexeme.orth]
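
Why langfunc('_') works in the new Vocab.lang property: the LANG getter installed by default_lex_attrs is lambda string: cls.lang, which ignores its string argument, so any dummy token retrieves the language code. A self-contained sketch of that lookup (LANG stands in for the attrs.LANG id):

    # Self-contained sketch of the Vocab.lang resolution; the lambda mirrors
    # the attrs.LANG entry built by Language.default_lex_attrs above.
    LANG = 91                                     # stand-in attribute id
    get_lex_attr = {LANG: lambda string: 'de'}    # as built for a German vocab
    langfunc = get_lex_attr.get(LANG, None) if get_lex_attr else None
    print(langfunc('_') if langfunc else '')      # -> 'de'; '_' is a dummy token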