introduce lang field for LexemeC to hold language id

put noun_chunk logic into iterators.py for each language separately
Wolfgang Seeker 2016-03-10 13:01:34 +01:00
parent bc9c62e279
commit 03fb498dbe
16 changed files with 103 additions and 70 deletions
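The net effect for users: every lexeme now carries the id of the language it was built for (exposed as lang/lang_ on Lexeme and Token), and Doc.noun_chunks picks its chunking rules per sentence from the language of the root token. A minimal usage sketch, not part of the diff, assuming the English model data is installed via python -m spacy.en.download all:

    from spacy.en import English

    nlp = English()
    doc = nlp(u'The quick brown fox jumped over the lazy dog.')

    # lang_ is the new lexeme-level language id, filled from Language.lang
    # when lexical attributes are computed
    print(doc[0].lang_)

    # noun_chunks now dispatches on the language of each sentence's root token
    for np in doc.noun_chunks:
        print(np)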

View File

@@ -109,7 +109,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
     else:
         file_ = loc.open()
     for i, line in enumerate(file_):
-        freq, doc_freq, key = line.split('\t', 2)
+        freq, doc_freq, key = line.rstrip().split('\t', 2)
         freq = int(freq)
         counts.inc(i+1, freq)
         total += freq
@@ -121,7 +121,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
         file_ = loc.open()
     probs = {}
     for line in file_:
-        freq, doc_freq, key = line.split('\t', 2)
+        freq, doc_freq, key = line.rstrip().split('\t', 2)
         doc_freq = int(doc_freq)
         freq = int(freq)
         if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:

View File

@@ -56,14 +56,15 @@ MOD_NAMES = [
     'spacy.tokens.doc',
     'spacy.tokens.span',
     'spacy.tokens.token',
-    'spacy.tokens.npchunks',
     'spacy.serialize.packer',
     'spacy.serialize.huffman',
     'spacy.serialize.bits',
     'spacy.cfile',
     'spacy.matcher',
     'spacy.syntax.ner',
-    'spacy.symbols']
+    'spacy.symbols',
+    'spacy.en.iterators',
+    'spacy.de.iterators']


 # By subclassing build_extensions we have the actual compiler that will be used

View File

@@ -14,12 +14,12 @@ cpdef enum attr_id_t:
     LIKE_EMAIL
     IS_STOP
     IS_OOV
-    FLAG14 = 14
-    FLAG15
-    FLAG16
-    FLAG17
-    FLAG18
+    IS_BRACKET
+    IS_QUOTE
+    IS_LEFT_PUNCT
+    IS_RIGHT_PUNCT
+    FLAG18 = 18
     FLAG19
     FLAG20
     FLAG21
@@ -85,11 +85,7 @@ cpdef enum attr_id_t:
     HEAD
     SPACY
     PROB
+    LANG
-
-    # Move these up to FLAG14--FLAG18 once we finish the functionality and
-    # are ready to regenerate the model
-    #IS_BRACKET
-    #IS_QUOTE
-    #IS_LEFT_PUNCT
-    #IS_RIGHT_PUNCT

View File

@@ -13,10 +13,10 @@ IDS = {
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
     "IS_OOV": IS_OOV,
-    "FLAG14": FLAG14,
-    "FLAG15": FLAG15,
-    "FLAG16": FLAG16,
-    "FLAG17": FLAG17,
+    "IS_BRACKET": IS_BRACKET,
+    "IS_QUOTE": IS_QUOTE,
+    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
+    "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
     "FLAG18": FLAG18,
     "FLAG19": FLAG19,
     "FLAG20": FLAG20,
@@ -83,6 +83,7 @@ IDS = {
     "HEAD": HEAD,
     "SPACY": SPACY,
     "PROB": PROB,
+    "LANG": LANG,
 }

 # ATTR IDs, in order of the symbol
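With the flags promoted to named attributes and LANG registered, the string-to-id table stays in sync with the enum, so the new names can be used anywhere attribute ids are looked up by string. A small illustrative check, not taken from the diff:

    from spacy.attrs import IDS, IS_QUOTE, IS_BRACKET, LANG

    assert IDS['IS_QUOTE'] == IS_QUOTE
    assert IDS['IS_BRACKET'] == IS_BRACKET
    assert IDS['LANG'] == LANG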

View File

@@ -1,31 +1,9 @@
-from ..structs cimport TokenC
-from .doc cimport Doc
-from .span cimport Span
-from ..parts_of_speech cimport NOUN, PROPN, PRON
-
-
-def english(Span sent):
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
-    np_deps = [strings[label] for label in labels]
-    conj = strings['conj']
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-        elif word.pos == NOUN and word.dep == conj:
-            head = word+word.head
-            while head.dep == conj and head.head < 0:
-                head += head.head
-            # If the head is an NP, and we're coordinated to it, we're an NP
-            if head.dep in np_deps:
-                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-
-
-def german(Span sent):
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+from spacy.parts_of_speech cimport NOUN
+
+
+def noun_chunks(Span sent):
     # this function extracts spans headed by NOUNs starting from the left-most
     # syntactic dependent until the NOUN itself
     # for close apposition and measurement construction, the span is sometimes
@@ -48,7 +26,3 @@ def german(Span sent):
             if rdep.pos == NOUN and rdep.dep == close_app:
                 rbracket = rdep.i+1
         yield Span(sent.doc, word.l_edge, rbracket, label=np_label)

spacy/en/iterators.pxd Normal file (0 lines)
View File

spacy/en/iterators.pyx Normal file (24 lines)
View File

@@ -0,0 +1,24 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+from spacy.parts_of_speech cimport NOUN
+
+
+def noun_chunks(Span sent):
+    cdef const TokenC* word
+    strings = sent.doc.vocab.strings
+    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+    np_deps = [strings[label] for label in labels]
+    conj = strings['conj']
+    np_label = strings['NP']
+    for i in range(sent.start, sent.end):
+        word = &sent.doc.c[i]
+        if word.pos == NOUN and word.dep in np_deps:
+            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
+        elif word.pos == NOUN and word.dep == conj:
+            head = word+word.head
+            while head.dep == conj and head.head < 0:
+                head += head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
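The chunker operates on sentence Spans, which is exactly what Doc.noun_chunks hands it, so it can also be called directly. An illustrative usage sketch, assuming a parsed English Doc:

    from spacy.en import English
    from spacy.en.iterators import noun_chunks

    nlp = English()
    doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')
    for sent in doc.sents:
        for np in noun_chunks(sent):   # yields Spans labelled 'NP'
            print(np)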

View File

@@ -69,6 +69,7 @@ class Language(object):
             attrs.SUFFIX: cls.suffix,
             attrs.CLUSTER: cls.cluster,
             attrs.PROB: lambda string: oov_prob,
+            attrs.LANG: lambda string: cls.lang,
             attrs.IS_ALPHA: orth.is_alpha,
             attrs.IS_ASCII: orth.is_ascii,
             attrs.IS_DIGIT: cls.is_digit,
@@ -77,10 +78,10 @@ class Language(object):
             attrs.IS_SPACE: cls.is_space,
             attrs.IS_TITLE: orth.is_title,
             attrs.IS_UPPER: orth.is_upper,
-            attrs.FLAG14: orth.is_bracket,
-            attrs.FLAG15: orth.is_quote,
-            attrs.FLAG16: orth.is_left_punct,
-            attrs.FLAG17: orth.is_right_punct,
+            attrs.IS_BRACKET: orth.is_bracket,
+            attrs.IS_QUOTE: orth.is_quote,
+            attrs.IS_LEFT_PUNCT: orth.is_left_punct,
+            attrs.IS_RIGHT_PUNCT: orth.is_right_punct,
             attrs.LIKE_URL: orth.like_url,
             attrs.LIKE_NUM: orth.like_number,
             attrs.LIKE_EMAIL: orth.like_email,

View File

@@ -1,6 +1,6 @@
 from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
 from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
+from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG

 from .structs cimport LexemeC
 from .strings cimport StringStore
@@ -41,6 +41,8 @@ cdef class Lexeme:
             lex.suffix = value
         elif name == CLUSTER:
             lex.cluster = value
+        elif name == LANG:
+            lex.lang = value

     @staticmethod
     cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
@@ -67,6 +69,8 @@ cdef class Lexeme:
             return lex.length
         elif feat_name == CLUSTER:
             return lex.cluster
+        elif feat_name == LANG:
+            return lex.lang
         else:
             return 0

View File

@@ -18,10 +18,10 @@ import numpy

 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from .attrs cimport FLAG14 as IS_BRACKET
-from .attrs cimport FLAG15 as IS_QUOTE
-from .attrs cimport FLAG16 as IS_LEFT_PUNCT
-from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
+from .attrs cimport IS_BRACKET
+from .attrs cimport IS_QUOTE
+from .attrs cimport IS_LEFT_PUNCT
+from .attrs cimport IS_RIGHT_PUNCT
 from .attrs cimport IS_OOV
@@ -123,6 +123,10 @@ cdef class Lexeme:
         def __get__(self): return self.c.cluster
         def __set__(self, int x): self.c.cluster = x

+    property lang:
+        def __get__(self): return self.c.lang
+        def __set__(self, int x): self.c.lang = x
+
     property prob:
         def __get__(self): return self.c.prob
         def __set__(self, float x): self.c.prob = x
@@ -147,6 +151,10 @@ cdef class Lexeme:
         def __get__(self): return self.vocab.strings[self.c.suffix]
         def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]

+    property lang_:
+        def __get__(self): return self.vocab.strings[self.c.lang]
+        def __set__(self, unicode x): self.c.lang = self.vocab.strings[x]
+
     property flags:
         def __get__(self): return self.c.flags
         def __set__(self, flags_t x): self.c.flags = x
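As with cluster and prob, the new field is exposed both as the raw integer (lang, a StringStore id) and as the decoded string (lang_). A hedged sketch, assuming an English vocab is loaded:

    from spacy.en import English

    nlp = English()
    apple = nlp.vocab[u'apple']
    print(apple.lang, apple.lang_)    # string-store id and, e.g., u'en'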

View File

@@ -40,17 +40,17 @@ cpdef bint is_bracket(unicode string):


 cpdef bint is_quote(unicode string):
-    quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯')
+    quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯',"''",'``')
     return string in quotes


 cpdef bint is_left_punct(unicode string):
-    left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮')
+    left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮','``')
     return string in left_punct


 cpdef bint is_right_punct(unicode string):
-    right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯')
+    right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯',"''")
     return string in right_punct
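The tuples gain the PTB-style two-character quote tokens `` and '', so pre-tokenised treebank text is recognised as quotation and punctuation as well. A quick illustrative check; these cpdef predicates are callable from Python:

    from spacy.orth import is_quote, is_left_punct, is_right_punct

    assert is_quote(u'``') and is_quote(u"''")
    assert is_left_punct(u'``')
    assert is_right_punct(u"''")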

View File

@@ -9,6 +9,8 @@ cdef struct LexemeC:
     flags_t flags

+    attr_t lang
+
     attr_t id
     attr_t length

View File

@@ -8,6 +8,7 @@ import struct
 cimport numpy as np
 import math
 import six
+import warnings

 from ..lexeme cimport Lexeme
 from ..lexeme cimport EMPTY_LEXEME
@@ -23,7 +24,6 @@ from .token cimport Token
 from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
-import npchunks

 DEF PADDING = 5
@@ -241,11 +241,23 @@ cdef class Doc:
                 "\npython -m spacy.en.download all\n"
                 "to install the data")
-        chunk_rules = {'en':npchunks.english, 'de':npchunks.german}
+
+        from spacy.en.iterators import noun_chunks as en_noun_chunks
+        from spacy.de.iterators import noun_chunks as de_noun_chunks
+
+        chunk_rules = {'en':en_noun_chunks,
+                       'de':de_noun_chunks,
+                       }
+
         for sent in self.sents:
-            lang = 'en' # todo: make dependent on language of root token
-            for chunk in chunk_rules.get(lang)(sent):
+            print(sent)
+            lang = sent.root.lang_
+            chunker = chunk_rules.get(lang,None)
+            if chunker == None:
+                warnings.warn("noun_chunks is not available for language %s." % lang)
+                print(sent.root.orth_)
+                continue
+            for chunk in chunker(sent):
                 yield chunk
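Restated in plain Python (illustrative only, without the print() calls): the chunker is chosen per sentence from the language of the root token, and sentences whose language has no registered rules are skipped with a warning rather than raising.

    import warnings
    from spacy.en.iterators import noun_chunks as en_noun_chunks
    from spacy.de.iterators import noun_chunks as de_noun_chunks

    CHUNK_RULES = {'en': en_noun_chunks, 'de': de_noun_chunks}

    def iter_noun_chunks(doc):
        for sent in doc.sents:
            chunker = CHUNK_RULES.get(sent.root.lang_)
            if chunker is None:
                warnings.warn("noun_chunks is not available for language %s."
                              % sent.root.lang_)
                continue
            for chunk in chunker(sent):
                yield chunk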

View File

@@ -18,10 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CONJ, PUNCT
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from ..attrs cimport FLAG14 as IS_BRACKET
-from ..attrs cimport FLAG15 as IS_QUOTE
-from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
-from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
+from ..attrs cimport IS_BRACKET
+from ..attrs cimport IS_QUOTE
+from ..attrs cimport IS_LEFT_PUNCT
+from ..attrs cimport IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV
@@ -95,6 +95,10 @@ cdef class Token:
         def __get__(self):
             return self.c.lex.prob

+    property lang:
+        def __get__(self):
+            return self.c.lex.lang
+
     property idx:
         def __get__(self):
             return self.c.idx
@@ -310,6 +314,10 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.c.lex.suffix]

+    property lang_:
+        def __get__(self):
+            return self.vocab.strings[self.c.lex.lang]
+
     property lemma_:
         def __get__(self):
             return self.vocab.strings[self.c.lemma]

View File

@@ -246,6 +246,7 @@ cdef class Vocab:
             fp.write_from(&lexeme.prob, sizeof(lexeme.prob), 1)
             fp.write_from(&lexeme.sentiment, sizeof(lexeme.sentiment), 1)
             fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
+            fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1)
         fp.close()

     def load_lexemes(self, loc):
@@ -278,6 +279,7 @@ cdef class Vocab:
             fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
             fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
             fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
+            fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang))
             lexeme.vector = EMPTY_VEC
             py_str = self.strings[lexeme.orth]