Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-14 05:37:03 +03:00)

Merge pull request #306 from wbwseeker/german_noun_chunks

add German noun chunk functionality

This commit is contained in commit 872695759d.
setup.py (4 changes)

@@ -63,7 +63,8 @@ MOD_NAMES = [
     'spacy.cfile',
     'spacy.matcher',
     'spacy.syntax.ner',
-    'spacy.symbols']
+    'spacy.symbols',
+    'spacy.syntax.iterators']


 # By subclassing build_extensions we have the actual compiler that will be used
@@ -213,3 +214,4 @@ def setup_package():

 if __name__ == '__main__':
     setup_package()
+
spacy/attrs.pxd

@@ -14,12 +14,12 @@ cpdef enum attr_id_t:
     LIKE_EMAIL
     IS_STOP
     IS_OOV
+    IS_BRACKET
+    IS_QUOTE
+    IS_LEFT_PUNCT
+    IS_RIGHT_PUNCT

-    FLAG14 = 14
-    FLAG15
-    FLAG16
-    FLAG17
-    FLAG18
+    FLAG18 = 18
     FLAG19
     FLAG20
     FLAG21
@@ -86,10 +86,6 @@ cpdef enum attr_id_t:
     SPACY
     PROB

-    # Move these up to FLAG14--FLAG18 once we finish the functionality and
-    # are ready to regenerate the model
-    #IS_BRACKET
-    #IS_QUOTE
-    #IS_LEFT_PUNCT
-    #IS_RIGHT_PUNCT
+    LANG
spacy/attrs.pyx

@@ -13,10 +13,10 @@ IDS = {
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
     "IS_OOV": IS_OOV,
-    "FLAG14": FLAG14,
-    "FLAG15": FLAG15,
-    "FLAG16": FLAG16,
-    "FLAG17": FLAG17,
+    "IS_BRACKET": IS_BRACKET,
+    "IS_QUOTE": IS_QUOTE,
+    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
+    "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
     "FLAG18": FLAG18,
     "FLAG19": FLAG19,
     "FLAG20": FLAG20,
@@ -83,6 +83,7 @@ IDS = {
     "HEAD": HEAD,
     "SPACY": SPACY,
     "PROB": PROB,
+    "LANG": LANG,
 }


 # ATTR IDs, in order of the symbol
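With these attributes promoted from numbered flags to named enum members, they can be looked up by name in the IDS table. A minimal sketch, assuming a build of this branch is installed (the loop below is illustrative):

    from spacy.attrs import IDS

    # each promoted attribute now resolves to a stable attr_id_t value
    for name in ('IS_BRACKET', 'IS_QUOTE', 'IS_LEFT_PUNCT', 'IS_RIGHT_PUNCT', 'LANG'):
        print(name, IDS[name])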
spacy/language.py

@@ -34,10 +34,6 @@ class Language(object):
     def norm(string):
         return string

-    @staticmethod
-    def shape(string):
-        return orth.word_shape(string)
-
     @staticmethod
     def prefix(string):
         return string[0]
@@ -50,66 +46,14 @@ class Language(object):
     def cluster(string):
         return 0

-    @staticmethod
-    def is_alpha(string):
-        return orth.is_alpha(string)
-
-    @staticmethod
-    def is_ascii(string):
-        return orth.is_ascii(string)
-
     @staticmethod
     def is_digit(string):
         return string.isdigit()

-    @staticmethod
-    def is_lower(string):
-        return orth.is_lower(string)
-
-    @staticmethod
-    def is_punct(string):
-        return orth.is_punct(string)
-
     @staticmethod
     def is_space(string):
         return string.isspace()

-    @staticmethod
-    def is_title(string):
-        return orth.is_title(string)
-
-    @staticmethod
-    def is_bracket(string):
-        return orth.is_bracket(string)
-
-    @staticmethod
-    def is_quote(string):
-        return orth.is_quote(string)
-
-    @staticmethod
-    def is_left_punct(string):
-        return orth.is_left_punct(string)
-
-    @staticmethod
-    def is_right_punct(string):
-        return orth.is_right_punct(string)
-
-    @staticmethod
-    def is_upper(string):
-        return orth.is_upper(string)
-
-    @staticmethod
-    def like_url(string):
-        return orth.like_url(string)
-
-    @staticmethod
-    def like_num(string):
-        return orth.like_number(string)
-
-    @staticmethod
-    def like_email(string):
-        return orth.like_email(string)
-
     @staticmethod
     def is_stop(string):
         return 0
@@ -120,26 +64,27 @@ class Language(object):
         return {
             attrs.LOWER: cls.lower,
             attrs.NORM: cls.norm,
-            attrs.SHAPE: cls.shape,
+            attrs.SHAPE: orth.word_shape,
             attrs.PREFIX: cls.prefix,
             attrs.SUFFIX: cls.suffix,
             attrs.CLUSTER: cls.cluster,
             attrs.PROB: lambda string: oov_prob,
-            attrs.IS_ALPHA: cls.is_alpha,
-            attrs.IS_ASCII: cls.is_ascii,
+            attrs.LANG: lambda string: cls.lang,
+            attrs.IS_ALPHA: orth.is_alpha,
+            attrs.IS_ASCII: orth.is_ascii,
             attrs.IS_DIGIT: cls.is_digit,
-            attrs.IS_LOWER: cls.is_lower,
-            attrs.IS_PUNCT: cls.is_punct,
+            attrs.IS_LOWER: orth.is_lower,
+            attrs.IS_PUNCT: orth.is_punct,
             attrs.IS_SPACE: cls.is_space,
-            attrs.IS_TITLE: cls.is_title,
-            attrs.IS_UPPER: cls.is_upper,
-            attrs.FLAG14: cls.is_bracket,
-            attrs.FLAG15: cls.is_quote,
-            attrs.FLAG16: cls.is_left_punct,
-            attrs.FLAG17: cls.is_right_punct,
-            attrs.LIKE_URL: cls.like_url,
-            attrs.LIKE_NUM: cls.like_num,
-            attrs.LIKE_EMAIL: cls.like_email,
+            attrs.IS_TITLE: orth.is_title,
+            attrs.IS_UPPER: orth.is_upper,
+            attrs.IS_BRACKET: orth.is_bracket,
+            attrs.IS_QUOTE: orth.is_quote,
+            attrs.IS_LEFT_PUNCT: orth.is_left_punct,
+            attrs.IS_RIGHT_PUNCT: orth.is_right_punct,
+            attrs.LIKE_URL: orth.like_url,
+            attrs.LIKE_NUM: orth.like_number,
+            attrs.LIKE_EMAIL: orth.like_email,
             attrs.IS_STOP: cls.is_stop,
             attrs.IS_OOV: lambda string: True
         }
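The net effect is that the getter table references the spacy.orth functions directly instead of going through per-class staticmethod wrappers. A rough Python-only sketch of the pattern, with stand-in keys and values (the 'en' constant and string keys below are illustrative, not the real attr IDs):

    def is_bracket(string):
        return string in ('(', ')', '[', ']', '{', '}', '<', '>')

    getters = {
        'IS_BRACKET': is_bracket,        # direct function reference, no wrapper
        'LANG': lambda string: 'en',     # constant per Language subclass
    }

    assert getters['IS_BRACKET']('(')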
spacy/lexeme.pxd

@@ -1,6 +1,6 @@
 from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
 from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
+from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG

 from .structs cimport LexemeC
 from .strings cimport StringStore
@@ -41,6 +41,8 @@ cdef class Lexeme:
             lex.suffix = value
         elif name == CLUSTER:
             lex.cluster = value
+        elif name == LANG:
+            lex.lang = value

     @staticmethod
     cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
@@ -67,6 +69,8 @@ cdef class Lexeme:
             return lex.length
         elif feat_name == CLUSTER:
             return lex.cluster
+        elif feat_name == LANG:
+            return lex.lang
         else:
             return 0
spacy/lexeme.pyx

@@ -18,10 +18,10 @@ import numpy

 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from .attrs cimport FLAG14 as IS_BRACKET
-from .attrs cimport FLAG15 as IS_QUOTE
-from .attrs cimport FLAG16 as IS_LEFT_PUNCT
-from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
+from .attrs cimport IS_BRACKET
+from .attrs cimport IS_QUOTE
+from .attrs cimport IS_LEFT_PUNCT
+from .attrs cimport IS_RIGHT_PUNCT
 from .attrs cimport IS_OOV


@@ -74,8 +74,8 @@ cdef class Lexeme:
             raise ValueError(
                 "Word vectors set to length 0. This may be because the "
                 "data is not installed. If you haven't already, run"
-                "\npython -m spacy.en.download all\n"
-                "to install the data."
+                "\npython -m spacy.%s.download all\n"
+                "to install the data." % self.vocab.lang
             )

         vector_view = <float[:length,]>self.c.vector
@@ -123,6 +123,10 @@ cdef class Lexeme:
         def __get__(self): return self.c.cluster
         def __set__(self, int x): self.c.cluster = x

+    property lang:
+        def __get__(self): return self.c.lang
+        def __set__(self, int x): self.c.lang = x
+
     property prob:
         def __get__(self): return self.c.prob
         def __set__(self, float x): self.c.prob = x
@@ -147,6 +151,10 @@ cdef class Lexeme:
         def __get__(self): return self.vocab.strings[self.c.suffix]
         def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]

+    property lang_:
+        def __get__(self): return self.vocab.strings[self.c.lang]
+        def __set__(self, unicode x): self.c.lang = self.vocab.strings[x]
+
     property flags:
         def __get__(self): return self.c.flags
         def __set__(self, flags_t x): self.c.flags = x
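The new lang/lang_ properties follow the same pattern as cluster and prob: the integer symbol lives on the C struct, and the underscored variant decodes it through the StringStore. A hedged usage sketch (assumes a loaded pipeline `nlp` with language data; the word is arbitrary):

    lex = nlp.vocab[u'Tasse']
    print(lex.lang)     # integer symbol for the language code
    print(lex.lang_)    # e.g. u'de', decoded via vocab.strings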
spacy/orth.pyx

@@ -5,9 +5,6 @@ import unicodedata
 import re


-TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split()
-
-
 # Binary string features
 cpdef bint is_alpha(unicode string):
     return string.isalpha()
@@ -36,20 +33,25 @@ cpdef bint is_ascii(unicode string):
     else:
         return True

+
 cpdef bint is_bracket(unicode string):
-    return False
+    brackets = ('(',')','[',']','{','}','<','>')
+    return string in brackets
+

 cpdef bint is_quote(unicode string):
-    if string in ('"', "'"):
-        return True
-    else:
-        return False
+    quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯',"''",'``')
+    return string in quotes
+

 cpdef bint is_left_punct(unicode string):
-    return False
+    left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮','``')
+    return string in left_punct
+

 cpdef bint is_right_punct(unicode string):
-    return False
+    right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯',"''")
+    return string in right_punct
+

 cpdef bint is_title(unicode string):
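These predicates are plain membership tests over closed character sets, so they are easy to sanity-check in pure Python (tuples copied from the diff above):

    brackets = ('(', ')', '[', ']', '{', '}', '<', '>')
    right_punct = (')', ']', '}', '>', '"', "'", '»', '’', '”', '›', '❯', "''")

    assert '(' in brackets
    assert '»' in right_punct
    assert 'a' not in brackets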
spacy/structs.pxd

@@ -9,6 +9,8 @@ cdef struct LexemeC:

     flags_t flags

+    attr_t lang
+
     attr_t id
     attr_t length
spacy/syntax/iterators.pxd (new file, 16 lines)

@@ -0,0 +1,16 @@
+from spacy.tokens.doc cimport Doc
+
+cdef class DocIterator:
+    cdef Doc _doc
+
+cdef class EnglishNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+
+cdef class GermanNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _close_app
spacy/syntax/iterators.pyx (new file, 82 lines)

@@ -0,0 +1,82 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+from spacy.tokens.doc cimport Doc
+from spacy.tokens.token cimport Token
+
+from spacy.parts_of_speech cimport NOUN
+
+# base class for document iterators
+cdef class DocIterator:
+    def __init__(self, Doc doc):
+        self._doc = doc
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        raise NotImplementedError
+
+
+cdef class EnglishNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(EnglishNounChunks,self).__init__(doc)
+        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
+        self._conjunct = self._doc.vocab.strings['conj']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN:
+                if word.dep in self._np_deps:
+                    return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+                elif word.dep == self._conjunct:
+                    head = word+word.head
+                    while head.dep == self._conjunct and head.head < 0:
+                        head += head.head
+                    # If the head is an NP, and we're coordinated to it, we're an NP
+                    if head.dep in self._np_deps:
+                        return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+        raise StopIteration
+
+
+# this iterator extracts spans headed by NOUNs starting from the left-most
+# syntactic dependent until the NOUN itself
+# for close apposition and measurement construction, the span is sometimes
+# extended to the right of the NOUN
+# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
+# just "eine Tasse", same for "das Thema Familie"
+cdef class GermanNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(GermanNounChunks,self).__init__(doc)
+        labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
+        self._close_app = self._doc.vocab.strings['nk']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef int rbracket
+        cdef Token rdep
+        cdef widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN and word.dep in self._np_deps:
+                rbracket = widx+1
+                # try to extend the span to the right
+                # to capture close apposition/measurement constructions
+                for rdep in self._doc[widx].rights:
+                    if rdep.pos == NOUN and rdep.dep == self._close_app:
+                        rbracket = rdep.i+1
+                return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
+        raise StopIteration
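A hedged usage sketch of the new iterators (hypothetical snippet; it assumes the German model data is installed so the parser can run):

    from spacy.de import German

    nlp = German()
    doc = nlp(u'Ich trinke eine Tasse Tee.')
    for np in doc.noun_chunks:
        # the German iterator extends over close apposition, so the chunk
        # headed by 'Tasse' should come out as 'eine Tasse Tee'
        print(np)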
spacy/syntax/parser.pyx

@@ -47,6 +47,8 @@ from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC

+from spacy.syntax.iterators cimport DocIterator, EnglishNounChunks, GermanNounChunks
+CHUNKERS = {'en':EnglishNounChunks, 'de':GermanNounChunks}


 DEBUG = False
@@ -113,12 +115,9 @@ cdef class Parser:
         cdef int nr_feat = self.model.nr_feat
         with nogil:
             self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
-        tokens.is_parsed = True
         # Check for KeyboardInterrupt etc. Untested
         PyErr_CheckSignals()
-        # projectivize output
-        if self._projectivize:
-            PseudoProjectivity.deprojectivize(tokens)
+        self._finalize(tokens)

     def pipe(self, stream, int batch_size=1000, int n_threads=2):
         cdef Pool mem = Pool()
@@ -144,7 +143,7 @@ cdef class Parser:
                     raise ValueError("Error parsing doc: %s" % sent_str)
             PyErr_CheckSignals()
             for doc in queue:
-                doc.is_parsed = True
+                self._finalize(doc)
                 yield doc
             queue = []
         batch_size = len(queue)
@@ -155,10 +154,19 @@ cdef class Parser:
                 with gil:
                     sent_str = queue[i].text
                 raise ValueError("Error parsing doc: %s" % sent_str)
-        for doc in queue:
-            doc.is_parsed = True
-            yield doc
         PyErr_CheckSignals()
+        for doc in queue:
+            self._finalize(doc)
+            yield doc
+
+    def _finalize(self, Doc doc):
+        # deprojectivize output
+        if self._projectivize:
+            PseudoProjectivity.deprojectivize(doc)
+        # set annotation-specific iterators
+        doc.noun_chunks = CHUNKERS.get(doc.vocab.lang,DocIterator)
+        # mark doc as parsed
+        doc.is_parsed = True

     cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
         cdef ExampleC eg
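All three code paths now funnel through _finalize, which deprojectivizes, attaches a language-appropriate chunker, and only then marks the doc as parsed. The dispatch is a plain dict lookup with a fallback; a minimal Python rendering of the same logic (illustrative, not the Cython source):

    CHUNKERS = {'en': EnglishNounChunks, 'de': GermanNounChunks}

    def finalize(doc):
        # unknown languages fall back to the base DocIterator,
        # whose __next__ raises NotImplementedError
        doc.noun_chunks = CHUNKERS.get(doc.vocab.lang, DocIterator)
        doc.is_parsed = True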
spacy/tokens/doc.pxd

@@ -7,6 +7,8 @@ from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t

+from spacy.syntax.iterators cimport DocIterator
+

 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil

@@ -42,6 +44,8 @@ cdef class Doc:
     cdef int length
     cdef int max_length

+    cdef DocIterator noun_chunks_iterator
+
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1

     cpdef np.ndarray to_array(self, object features)
spacy/tokens/doc.pyx

@@ -8,6 +8,7 @@ import struct
 cimport numpy as np
 import math
 import six
+import warnings

 from ..lexeme cimport Lexeme
 from ..lexeme cimport EMPTY_LEXEME
@@ -80,6 +81,7 @@ cdef class Doc:
         self.is_parsed = False
         self._py_tokens = []
         self._vector = None
+        self.noun_chunks_iterator = DocIterator(self)

     def __getitem__(self, object i):
         """Get a Token or a Span from the Doc.
@@ -230,33 +232,22 @@ cdef class Doc:
         # Set start as B
         self.c[start].ent_iob = 3

-    @property
-    def noun_chunks(self):
-        """Yield spans for base noun phrases."""
-        if not self.is_parsed:
-            raise ValueError(
-                "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.en.download all\n"
-                "to install the data")
-        cdef const TokenC* word
-        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
-                  'attr', 'root']
-        np_deps = [self.vocab.strings[label] for label in labels]
-        conj = self.vocab.strings['conj']
-        np_label = self.vocab.strings['NP']
-        for i in range(self.length):
-            word = &self.c[i]
-            if word.pos == NOUN and word.dep in np_deps:
-                yield Span(self, word.l_edge, i+1, label=np_label)
-            elif word.pos == NOUN and word.dep == conj:
-                head = word+word.head
-                while head.dep == conj and head.head < 0:
-                    head += head.head
-                # If the head is an NP, and we're coordinated to it, we're an NP
-                if head.dep in np_deps:
-                    yield Span(self, word.l_edge, i+1, label=np_label)
+    property noun_chunks:
+        def __get__(self):
+            """Yield spans for base noun phrases."""
+            if not self.is_parsed:
+                raise ValueError(
+                    "noun_chunks requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.%s.download all\n"
+                    "to install the data" % self.vocab.lang)
+
+            yield from self.noun_chunks_iterator
+
+        def __set__(self, DocIterator):
+            self.noun_chunks_iterator = DocIterator(self)

     @property
     def sents(self):
@@ -267,8 +258,8 @@ cdef class Doc:
             raise ValueError(
                 "sentence boundary detection requires the dependency parse, which "
                 "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.en.download all\n"
-                "to install the data")
+                "\npython -m spacy.%s.download all\n"
+                "to install the data" % self.vocab.lang)
         cdef int i
         start = 0
         for i in range(1, self.length):
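Because noun_chunks is now a settable property whose setter instantiates whatever iterator class it receives, callers can plug in their own chunker. A hypothetical sketch (MyChunks is an invented name):

    from spacy.syntax.iterators import DocIterator

    class MyChunks(DocIterator):
        def __next__(self):
            raise StopIteration   # a real chunker would return Spans here

    doc.noun_chunks = MyChunks    # the setter calls MyChunks(doc)
    spans = list(doc.noun_chunks)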
spacy/tokens/token.pyx

@@ -18,10 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CONJ, PUNCT

 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from ..attrs cimport FLAG14 as IS_BRACKET
-from ..attrs cimport FLAG15 as IS_QUOTE
-from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
-from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
+from ..attrs cimport IS_BRACKET
+from ..attrs cimport IS_QUOTE
+from ..attrs cimport IS_LEFT_PUNCT
+from ..attrs cimport IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV

@@ -95,6 +95,10 @@ cdef class Token:
         def __get__(self):
             return self.c.lex.prob

+    property lang:
+        def __get__(self):
+            return self.c.lex.lang
+
     property idx:
         def __get__(self):
             return self.c.idx
@@ -161,8 +165,8 @@ cdef class Token:
             raise ValueError(
                 "Word vectors set to length 0. This may be because the "
                 "data is not installed. If you haven't already, run"
-                "\npython -m spacy.en.download all\n"
-                "to install the data."
+                "\npython -m spacy.%s.download all\n"
+                "to install the data." % self.vocab.lang
             )
         vector_view = <float[:length,]>self.c.lex.vector
         return numpy.asarray(vector_view)
@@ -177,23 +181,11 @@ cdef class Token:

     property n_lefts:
         def __get__(self):
-            cdef int n = 0
-            cdef const TokenC* ptr = self.c - self.i
-            while ptr != self.c:
-                if ptr + ptr.head == self.c:
-                    n += 1
-                ptr += 1
-            return n
+            return self.c.l_kids

     property n_rights:
         def __get__(self):
-            cdef int n = 0
-            cdef const TokenC* ptr = self.c + (self.array_len - self.i)
-            while ptr != self.c:
-                if ptr + ptr.head == self.c:
-                    n += 1
-                ptr -= 1
-            return n
+            return self.c.r_kids

     property lefts:
         def __get__(self):
@@ -415,6 +407,10 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.c.lex.suffix]

+    property lang_:
+        def __get__(self):
+            return self.vocab.strings[self.c.lex.lang]
+
     property lemma_:
         def __get__(self):
             return self.vocab.strings[self.c.lemma]
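n_lefts and n_rights previously rescanned the token array on every access; they now read the l_kids/r_kids counters the parser maintains, turning an O(sentence-length) walk into O(1). A hedged consistency check (assumes a parsed doc):

    token = doc[3]
    assert token.n_lefts == len(list(token.lefts))
    assert token.n_rights == len(list(token.rights))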
spacy/vocab.pyx

@@ -26,7 +26,7 @@ from . import symbols

 from cymem.cymem cimport Address
 from .serialize.packer cimport Packer
-from .attrs cimport PROB
+from .attrs cimport PROB, LANG

 try:
     import copy_reg
@@ -104,6 +104,13 @@ cdef class Vocab:
             self._serializer = Packer(self, self.serializer_freqs)
         return self._serializer

+    property lang:
+        def __get__(self):
+            langfunc = None
+            if self.get_lex_attr:
+                langfunc = self.get_lex_attr.get(LANG,None)
+            return langfunc('_') if langfunc else ''
+
     def __len__(self):
         """The current number of lexemes stored."""
         return self.length
@@ -245,6 +252,7 @@ cdef class Vocab:
             fp.write_from(&lexeme.prob, sizeof(lexeme.prob), 1)
             fp.write_from(&lexeme.sentiment, sizeof(lexeme.sentiment), 1)
             fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
+            fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1)
         fp.close()

     def load_lexemes(self, loc):
@@ -277,6 +285,7 @@ cdef class Vocab:
             fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
             fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
             fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
+            fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang))

             lexeme.vector = EMPTY_VEC
             py_str = self.strings[lexeme.orth]
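Vocab.lang recovers the language code by calling the LANG getter on a dummy string, which works because that getter is a constant lambda per Language subclass. The same logic in plain Python (a sketch, with `get_lex_attr` standing in for the Vocab field):

    def vocab_lang(get_lex_attr):
        langfunc = get_lex_attr.get(LANG, None) if get_lex_attr else None
        return langfunc('_') if langfunc else ''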