introduce lang field for LexemeC to hold language id

put noun_chunk logic into iterators.py for each language separately
Wolfgang Seeker 2016-03-10 13:01:34 +01:00
parent bc9c62e279
commit 03fb498dbe
16 changed files with 103 additions and 70 deletions

View File

@ -109,7 +109,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
else:
file_ = loc.open()
for i, line in enumerate(file_):
freq, doc_freq, key = line.split('\t', 2)
freq, doc_freq, key = line.rstrip().split('\t', 2)
freq = int(freq)
counts.inc(i+1, freq)
total += freq
@ -121,7 +121,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
file_ = loc.open()
probs = {}
for line in file_:
freq, doc_freq, key = line.split('\t', 2)
freq, doc_freq, key = line.rstrip().split('\t', 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
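The added rstrip() matters because split('\t', 2) splits at most twice, so without it the trailing newline stays attached to key and corrupts every string lookup. A minimal illustration (the values are made up):

line = '1028\t37\tthe\n'
freq, doc_freq, key = line.split('\t', 2)           # key == 'the\n'
freq, doc_freq, key = line.rstrip().split('\t', 2)  # key == 'the'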

View File

@ -56,14 +56,15 @@ MOD_NAMES = [
'spacy.tokens.doc',
'spacy.tokens.span',
'spacy.tokens.token',
'spacy.tokens.npchunks',
'spacy.serialize.packer',
'spacy.serialize.huffman',
'spacy.serialize.bits',
'spacy.cfile',
'spacy.matcher',
'spacy.syntax.ner',
'spacy.symbols']
'spacy.symbols',
'spacy.en.iterators',
'spacy.de.iterators']
# By subclassing build_extensions we have the actual compiler that will be used

View File

@ -14,12 +14,12 @@ cpdef enum attr_id_t:
LIKE_EMAIL
IS_STOP
IS_OOV
IS_BRACKET
IS_QUOTE
IS_LEFT_PUNCT
IS_RIGHT_PUNCT
FLAG14 = 14
FLAG15
FLAG16
FLAG17
FLAG18
FLAG18 = 18
FLAG19
FLAG20
FLAG21
@ -86,10 +86,6 @@ cpdef enum attr_id_t:
SPACY
PROB
# Move these up to FLAG14--FLAG18 once we finish the functionality and
# are ready to regenerate the model
#IS_BRACKET
#IS_QUOTE
#IS_LEFT_PUNCT
#IS_RIGHT_PUNCT
LANG

View File

@ -13,10 +13,10 @@ IDS = {
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
"IS_OOV": IS_OOV,
"FLAG14": FLAG14,
"FLAG15": FLAG15,
"FLAG16": FLAG16,
"FLAG17": FLAG17,
"IS_BRACKET": IS_BRACKET,
"IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
"FLAG18": FLAG18,
"FLAG19": FLAG19,
"FLAG20": FLAG20,
@ -83,6 +83,7 @@ IDS = {
"HEAD": HEAD,
"SPACY": SPACY,
"PROB": PROB,
"LANG": LANG,
}
# ATTR IDs, in order of the symbol
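With the flags promoted to named attributes, the table above lets string names resolve to the same ids as the cimported symbols. A small sketch using the IDS mapping this module defines (import path assumed from the package layout):

from spacy.attrs import IDS
bracket_id = IDS['IS_BRACKET']  # same id as the IS_BRACKET symbol
lang_id = IDS['LANG']           # the newly registered LANG attribute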

View File

@ -1,31 +1,9 @@
from spacy.structs cimport TokenC
from spacy.tokens.span cimport Span
from ..structs cimport TokenC
from .doc cimport Doc
from .span cimport Span
from spacy.parts_of_speech cimport NOUN
from ..parts_of_speech cimport NOUN, PROPN, PRON
def english(Span sent):
cdef const TokenC* word
strings = sent.doc.vocab.strings
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
np_deps = [strings[label] for label in labels]
conj = strings['conj']
np_label = strings['NP']
for i in range(sent.start, sent.end):
word = &sent.doc.c[i]
if word.pos == NOUN and word.dep in np_deps:
yield Span(sent.doc, word.l_edge, i+1, label=np_label)
elif word.pos == NOUN and word.dep == conj:
head = word+word.head
while head.dep == conj and head.head < 0:
head += head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
yield Span(sent.doc, word.l_edge, i+1, label=np_label)
def german(Span sent):
def noun_chunks(Span sent):
# this function extracts spans headed by NOUNs starting from the left-most
# syntactic dependent until the NOUN itself
# for close apposition and measurement construction, the span is sometimes
@ -48,7 +26,3 @@ def german(Span sent):
if rdep.pos == NOUN and rdep.dep == close_app:
rbracket = rdep.i+1
yield Span(sent.doc, word.l_edge, rbracket, label=np_label)
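To make the rbracket extension concrete: in a measurement construction such as "eine Tasse Kaffee" ("a cup of coffee"), the chunk headed by "Tasse" is extended rightwards over the close-apposition dependent, so the yielded span is "eine Tasse Kaffee" rather than just "eine Tasse" (illustrative; the exact spans depend on the parser's annotation scheme).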

spacy/en/iterators.pxd (new file, 0 lines)
View File

spacy/en/iterators.pyx (new file, 24 lines)
View File

@ -0,0 +1,24 @@
from spacy.structs cimport TokenC
from spacy.tokens.span cimport Span
from spacy.parts_of_speech cimport NOUN
def noun_chunks(Span sent):
cdef const TokenC* word
strings = sent.doc.vocab.strings
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
np_deps = [strings[label] for label in labels]
conj = strings['conj']
np_label = strings['NP']
for i in range(sent.start, sent.end):
word = &sent.doc.c[i]
if word.pos == NOUN and word.dep in np_deps:
yield Span(sent.doc, word.l_edge, i+1, label=np_label)
elif word.pos == NOUN and word.dep == conj:
head = word+word.head
while head.dep == conj and head.head < 0:
head += head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
yield Span(sent.doc, word.l_edge, i+1, label=np_label)
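The conj chain here is climbed with pointer arithmetic: TokenC stores the head as a relative offset, so word + word.head addresses the head's struct, and head.head < 0 means that head itself attaches leftwards. A pure-Python analogue over a hypothetical parse, to make the walk concrete:

# deps/heads are hypothetical; heads[i] is the relative offset from token i to its head
deps  = ['det', 'ROOT', 'conj', 'conj']
heads = [1, 0, -1, -1]
i = 3
head = i + heads[i]                              # mirrors `word + word.head`
while deps[head] == 'conj' and heads[head] < 0:
    head += heads[head]                          # keep climbing leftwards
# head is now 1, the first non-conj ancestor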

View File

@ -69,6 +69,7 @@ class Language(object):
attrs.SUFFIX: cls.suffix,
attrs.CLUSTER: cls.cluster,
attrs.PROB: lambda string: oov_prob,
attrs.LANG: lambda string: cls.lang,
attrs.IS_ALPHA: orth.is_alpha,
attrs.IS_ASCII: orth.is_ascii,
attrs.IS_DIGIT: cls.is_digit,
@ -77,10 +78,10 @@ class Language(object):
attrs.IS_SPACE: cls.is_space,
attrs.IS_TITLE: orth.is_title,
attrs.IS_UPPER: orth.is_upper,
attrs.FLAG14: orth.is_bracket,
attrs.FLAG15: orth.is_quote,
attrs.FLAG16: orth.is_left_punct,
attrs.FLAG17: orth.is_right_punct,
attrs.IS_BRACKET: orth.is_bracket,
attrs.IS_QUOTE: orth.is_quote,
attrs.IS_LEFT_PUNCT: orth.is_left_punct,
attrs.IS_RIGHT_PUNCT: orth.is_right_punct,
attrs.LIKE_URL: orth.like_url,
attrs.LIKE_NUM: orth.like_number,
attrs.LIKE_EMAIL: orth.like_email,
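The LANG getter closes over the class attribute, so every lexeme the vocab creates is stamped with its language id up front. A stripped-down sketch of the pattern (names are illustrative):

get_lang = lambda string: 'en'  # stands in for `lambda string: cls.lang`
get_lang(u'apple')              # 'en', regardless of the string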

View File

@ -1,6 +1,6 @@
from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
from .attrs cimport attr_id_t
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG
from .structs cimport LexemeC
from .strings cimport StringStore
@ -41,6 +41,8 @@ cdef class Lexeme:
lex.suffix = value
elif name == CLUSTER:
lex.cluster = value
elif name == LANG:
lex.lang = value
@staticmethod
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
@ -67,6 +69,8 @@ cdef class Lexeme:
return lex.length
elif feat_name == CLUSTER:
return lex.cluster
elif feat_name == LANG:
return lex.lang
else:
return 0

View File

@ -18,10 +18,10 @@ import numpy
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport FLAG14 as IS_BRACKET
from .attrs cimport FLAG15 as IS_QUOTE
from .attrs cimport FLAG16 as IS_LEFT_PUNCT
from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
from .attrs cimport IS_BRACKET
from .attrs cimport IS_QUOTE
from .attrs cimport IS_LEFT_PUNCT
from .attrs cimport IS_RIGHT_PUNCT
from .attrs cimport IS_OOV
@ -123,6 +123,10 @@ cdef class Lexeme:
def __get__(self): return self.c.cluster
def __set__(self, int x): self.c.cluster = x
property lang:
def __get__(self): return self.c.lang
def __set__(self, int x): self.c.lang = x
property prob:
def __get__(self): return self.c.prob
def __set__(self, float x): self.c.prob = x
@ -147,6 +151,10 @@ cdef class Lexeme:
def __get__(self): return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
property lang_:
def __get__(self): return self.vocab.strings[self.c.lang]
def __set__(self, unicode x): self.c.lang = self.vocab.strings[x]
property flags:
def __get__(self): return self.c.flags
def __set__(self, flags_t x): self.c.flags = x

View File

@ -40,17 +40,17 @@ cpdef bint is_bracket(unicode string):
cpdef bint is_quote(unicode string):
quotes = ('"',"'",'`','«','»','','','','','','','','','','','','')
quotes = ('"',"'",'`','«','»','','','','','','','','','','','','',"''",'``')
return string in quotes
cpdef bint is_left_punct(unicode string):
left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮')
left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮','``')
return string in left_punct
cpdef bint is_right_punct(unicode string):
right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯')
right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯',"''")
return string in right_punct

View File

@ -9,6 +9,8 @@ cdef struct LexemeC:
flags_t flags
attr_t lang
attr_t id
attr_t length

View File

@ -8,6 +8,7 @@ import struct
cimport numpy as np
import math
import six
import warnings
from ..lexeme cimport Lexeme
from ..lexeme cimport EMPTY_LEXEME
@ -23,7 +24,6 @@ from .token cimport Token
from ..serialize.bits cimport BitArray
from ..util import normalize_slice
import npchunks
DEF PADDING = 5
@ -241,11 +241,23 @@ cdef class Doc:
"\npython -m spacy.en.download all\n"
"to install the data")
chunk_rules = {'en':npchunks.english, 'de':npchunks.german}
from spacy.en.iterators import noun_chunks as en_noun_chunks
from spacy.de.iterators import noun_chunks as de_noun_chunks
chunk_rules = {'en':en_noun_chunks,
'de':de_noun_chunks,
}
for sent in self.sents:
lang = 'en' # todo: make dependent on language of root token
for chunk in chunk_rules.get(lang)(sent):
lang = sent.root.lang_
chunker = chunk_rules.get(lang, None)
if chunker is None:
warnings.warn("noun_chunks is not available for language %s." % lang)
continue
for chunk in chunker(sent):
yield chunk
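With this dispatch in place, noun_chunks picks a chunker per sentence from the language of the sentence's root token. A hedged usage sketch, assuming the English data is installed:

from spacy.en import English
nlp = English()
doc = nlp(u'The quick brown fox jumped over the lazy dog.')
for chunk in doc.noun_chunks:
    print(chunk)  # e.g. 'The quick brown fox', then 'the lazy dog' (illustrative)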

View File

@ -18,10 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
from ..parts_of_speech cimport CONJ, PUNCT
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport FLAG14 as IS_BRACKET
from ..attrs cimport FLAG15 as IS_QUOTE
from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
from ..attrs cimport IS_BRACKET
from ..attrs cimport IS_QUOTE
from ..attrs cimport IS_LEFT_PUNCT
from ..attrs cimport IS_RIGHT_PUNCT
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport IS_OOV
@ -95,6 +95,10 @@ cdef class Token:
def __get__(self):
return self.c.lex.prob
property lang:
def __get__(self):
return self.c.lex.lang
property idx:
def __get__(self):
return self.c.idx
@ -310,6 +314,10 @@ cdef class Token:
def __get__(self):
return self.vocab.strings[self.c.lex.suffix]
property lang_:
def __get__(self):
return self.vocab.strings[self.c.lex.lang]
property lemma_:
def __get__(self):
return self.vocab.strings[self.c.lemma]

View File

@ -246,6 +246,7 @@ cdef class Vocab:
fp.write_from(&lexeme.prob, sizeof(lexeme.prob), 1)
fp.write_from(&lexeme.sentiment, sizeof(lexeme.sentiment), 1)
fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1)
fp.close()
def load_lexemes(self, loc):
@ -278,6 +279,7 @@ cdef class Vocab:
fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang))
lexeme.vector = EMPTY_VEC
py_str = self.strings[lexeme.orth]
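The writer above and load_lexemes must agree exactly on field order and width, so lang is appended at the same position on both paths; note that previously dumped lexemes.bin files become incompatible, since a reader expecting the extra field would misalign everything that follows. The invariant, illustrated with Python's struct module (not the actual file layout):

import struct
prob, sentiment, l2_norm, lang = -7.5, 0.0, 1.0, 3.0
record = struct.pack('<ffff', prob, sentiment, l2_norm, lang)              # writer
assert struct.unpack('<ffff', record) == (prob, sentiment, l2_norm, lang)  # reader must match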