Merge pull request #306 from wbwseeker/german_noun_chunks

add German noun chunk functionality
Matthew Honnibal 2016-04-08 00:54:24 +10:00
commit 872695759d
15 changed files with 228 additions and 162 deletions

View File

@@ -63,7 +63,8 @@ MOD_NAMES = [
'spacy.cfile',
'spacy.matcher',
'spacy.syntax.ner',
'spacy.symbols']
'spacy.symbols',
'spacy.syntax.iterators']
# By subclassing build_extensions we have the actual compiler that will be used
@@ -213,3 +214,4 @@ def setup_package():
if __name__ == '__main__':
setup_package()

View File

@@ -14,12 +14,12 @@ cpdef enum attr_id_t:
LIKE_EMAIL
IS_STOP
IS_OOV
FLAG14 = 14
FLAG15
FLAG16
FLAG17
FLAG18
IS_BRACKET
IS_QUOTE
IS_LEFT_PUNCT
IS_RIGHT_PUNCT
FLAG18 = 18
FLAG19
FLAG20
FLAG21
@@ -85,11 +85,7 @@ cpdef enum attr_id_t:
HEAD
SPACY
PROB
LANG
# Move these up to FLAG14--FLAG18 once we finish the functionality and
# are ready to regenerate the model
#IS_BRACKET
#IS_QUOTE
#IS_LEFT_PUNCT
#IS_RIGHT_PUNCT

View File

@@ -13,10 +13,10 @@ IDS = {
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
"IS_OOV": IS_OOV,
"FLAG14": FLAG14,
"FLAG15": FLAG15,
"FLAG16": FLAG16,
"FLAG17": FLAG17,
"IS_BRACKET": IS_BRACKET,
"IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
"FLAG18": FLAG18,
"FLAG19": FLAG19,
"FLAG20": FLAG20,
@@ -83,6 +83,7 @@ IDS = {
"HEAD": HEAD,
"SPACY": SPACY,
"PROB": PROB,
"LANG": LANG,
}
# ATTR IDs, in order of the symbol

View File

@@ -33,10 +33,6 @@ class Language(object):
@staticmethod
def norm(string):
return string
@staticmethod
def shape(string):
return orth.word_shape(string)
@staticmethod
def prefix(string):
@@ -50,66 +46,14 @@ class Language(object):
def cluster(string):
return 0
@staticmethod
def is_alpha(string):
return orth.is_alpha(string)
@staticmethod
def is_ascii(string):
return orth.is_ascii(string)
@staticmethod
def is_digit(string):
return string.isdigit()
@staticmethod
def is_lower(string):
return orth.is_lower(string)
@staticmethod
def is_punct(string):
return orth.is_punct(string)
@staticmethod
def is_space(string):
return string.isspace()
@staticmethod
def is_title(string):
return orth.is_title(string)
@staticmethod
def is_bracket(string):
return orth.is_bracket(string)
@staticmethod
def is_quote(string):
return orth.is_quote(string)
@staticmethod
def is_left_punct(string):
return orth.is_left_punct(string)
@staticmethod
def is_right_punct(string):
return orth.is_right_punct(string)
@staticmethod
def is_upper(string):
return orth.is_upper(string)
@staticmethod
def like_url(string):
return orth.like_url(string)
@staticmethod
def like_num(string):
return orth.like_number(string)
@staticmethod
def like_email(string):
return orth.like_email(string)
@staticmethod
def is_stop(string):
return 0
@@ -120,26 +64,27 @@ class Language(object):
return {
attrs.LOWER: cls.lower,
attrs.NORM: cls.norm,
attrs.SHAPE: cls.shape,
attrs.SHAPE: orth.word_shape,
attrs.PREFIX: cls.prefix,
attrs.SUFFIX: cls.suffix,
attrs.CLUSTER: cls.cluster,
attrs.PROB: lambda string: oov_prob,
attrs.IS_ALPHA: cls.is_alpha,
attrs.IS_ASCII: cls.is_ascii,
attrs.LANG: lambda string: cls.lang,
attrs.IS_ALPHA: orth.is_alpha,
attrs.IS_ASCII: orth.is_ascii,
attrs.IS_DIGIT: cls.is_digit,
attrs.IS_LOWER: cls.is_lower,
attrs.IS_PUNCT: cls.is_punct,
attrs.IS_LOWER: orth.is_lower,
attrs.IS_PUNCT: orth.is_punct,
attrs.IS_SPACE: cls.is_space,
attrs.IS_TITLE: cls.is_title,
attrs.IS_UPPER: cls.is_upper,
attrs.FLAG14: cls.is_bracket,
attrs.FLAG15: cls.is_quote,
attrs.FLAG16: cls.is_left_punct,
attrs.FLAG17: cls.is_right_punct,
attrs.LIKE_URL: cls.like_url,
attrs.LIKE_NUM: cls.like_num,
attrs.LIKE_EMAIL: cls.like_email,
attrs.IS_TITLE: orth.is_title,
attrs.IS_UPPER: orth.is_upper,
attrs.IS_BRACKET: orth.is_bracket,
attrs.IS_QUOTE: orth.is_quote,
attrs.IS_LEFT_PUNCT: orth.is_left_punct,
attrs.IS_RIGHT_PUNCT: orth.is_right_punct,
attrs.LIKE_URL: orth.like_url,
attrs.LIKE_NUM: orth.like_number,
attrs.LIKE_EMAIL: orth.like_email,
attrs.IS_STOP: cls.is_stop,
attrs.IS_OOV: lambda string: True
}
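The table above is a plain dispatch from attribute IDs to string functions, which is why the diff can swap the cls.* wrappers for the orth.* functions directly without touching any caller. A stand-alone sketch of the pattern (illustrative names, not spaCy's API):

def is_bracket(string):
    # mirrors the membership test added in orth.pyx later in this diff
    return string in (u'(', u')', u'[', u']', u'{', u'}', u'<', u'>')

def is_quote(string):
    return string in (u'"', u"'", u'`')

LEX_GETTERS = {u'IS_BRACKET': is_bracket, u'IS_QUOTE': is_quote}

def lex_attrs(string):
    # every attribute is computed independently from the raw string
    return dict((name, get(string)) for name, get in LEX_GETTERS.items())

print(lex_attrs(u'('))   # {u'IS_BRACKET': True, u'IS_QUOTE': False}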

View File

@@ -1,6 +1,6 @@
from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
from .attrs cimport attr_id_t
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG
from .structs cimport LexemeC
from .strings cimport StringStore
@@ -41,6 +41,8 @@ cdef class Lexeme:
lex.suffix = value
elif name == CLUSTER:
lex.cluster = value
elif name == LANG:
lex.lang = value
@staticmethod
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
@@ -67,6 +69,8 @@ cdef class Lexeme:
return lex.length
elif feat_name == CLUSTER:
return lex.cluster
elif feat_name == LANG:
return lex.lang
else:
return 0

View File

@@ -18,10 +18,10 @@ import numpy
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport FLAG14 as IS_BRACKET
from .attrs cimport FLAG15 as IS_QUOTE
from .attrs cimport FLAG16 as IS_LEFT_PUNCT
from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
from .attrs cimport IS_BRACKET
from .attrs cimport IS_QUOTE
from .attrs cimport IS_LEFT_PUNCT
from .attrs cimport IS_RIGHT_PUNCT
from .attrs cimport IS_OOV
@@ -74,8 +74,8 @@ cdef class Lexeme:
raise ValueError(
"Word vectors set to length 0. This may be because the "
"data is not installed. If you haven't already, run"
"\npython -m spacy.en.download all\n"
"to install the data."
"\npython -m spacy.%s.download all\n"
"to install the data." % self.vocab.lang
)
vector_view = <float[:length,]>self.c.vector
@@ -123,6 +123,10 @@ cdef class Lexeme:
def __get__(self): return self.c.cluster
def __set__(self, int x): self.c.cluster = x
property lang:
def __get__(self): return self.c.lang
def __set__(self, int x): self.c.lang = x
property prob:
def __get__(self): return self.c.prob
def __set__(self, float x): self.c.prob = x
@@ -147,6 +151,10 @@ cdef class Lexeme:
def __get__(self): return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
property lang_:
def __get__(self): return self.vocab.strings[self.c.lang]
def __set__(self, unicode x): self.c.lang = self.vocab.strings[x]
property flags:
def __get__(self): return self.c.flags
def __set__(self, flags_t x): self.c.flags = x
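A usage sketch for the new lang / lang_ pair (assuming the German model data is installed; the pair mirrors existing int/str attribute pairs such as suffix / suffix_):

from spacy.de import German

nlp = German()
lex = nlp.vocab[u'Tasse']
print(lex.lang_)   # e.g. u'de' for a German vocab
print(lex.lang)    # the corresponding integer ID in vocab.strings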

View File

@@ -5,9 +5,6 @@ import unicodedata
import re
TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split()
# Binary string features
cpdef bint is_alpha(unicode string):
return string.isalpha()
@@ -36,20 +33,25 @@ cpdef bint is_ascii(unicode string):
else:
return True
cpdef bint is_bracket(unicode string):
return False
brackets = ('(',')','[',']','{','}','<','>')
return string in brackets
cpdef bint is_quote(unicode string):
if string in ('"', "'"):
return True
else:
return False
quotes = ('"',"'",'`','«','»','','','','','','','','','','','','',"''",'``')
return string in quotes
cpdef bint is_left_punct(unicode string):
return False
left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮','``')
return string in left_punct
cpdef bint is_right_punct(unicode string):
return False
right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯',"''")
return string in right_punct
cpdef bint is_title(unicode string):
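A quick usage sketch of the new predicates (assuming spacy.orth is the compiled module for this file; cpdef functions are callable from Python):

from spacy.orth import is_bracket, is_quote, is_left_punct, is_right_punct

assert is_bracket(u'{') and not is_bracket(u'-')
assert is_quote(u'«') and is_left_punct(u'«') and not is_left_punct(u'»')
assert is_right_punct(u'»') and not is_right_punct(u'«')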

View File

@@ -9,6 +9,8 @@ cdef struct LexemeC:
flags_t flags
attr_t lang
attr_t id
attr_t length

View File

@@ -0,0 +1,16 @@
from spacy.tokens.doc cimport Doc
cdef class DocIterator:
cdef Doc _doc
cdef class EnglishNounChunks(DocIterator):
cdef int i
cdef int _np_label
cdef set _np_deps
cdef class GermanNounChunks(DocIterator):
cdef int i
cdef int _np_label
cdef set _np_deps
cdef int _close_app

View File

@@ -0,0 +1,82 @@
from spacy.structs cimport TokenC
from spacy.tokens.span cimport Span
from spacy.tokens.doc cimport Doc
from spacy.tokens.token cimport Token
from spacy.parts_of_speech cimport NOUN
# base class for document iterators
cdef class DocIterator:
def __init__(self, Doc doc):
self._doc = doc
def __iter__(self):
return self
def __next__(self):
raise NotImplementedError
cdef class EnglishNounChunks(DocIterator):
def __init__(self, Doc doc):
super(EnglishNounChunks,self).__init__(doc)
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
self._np_label = self._doc.vocab.strings['NP']
self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
self._conjunct = self._doc.vocab.strings['conj']
self.i = 0
def __next__(self):
cdef const TokenC* word
cdef widx
while self.i < self._doc.length:
widx = self.i
self.i += 1
word = &self._doc.c[widx]
if word.pos == NOUN:
if word.dep in self._np_deps:
return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
elif word.dep == self._conjunct:
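# TokenC.head stores a relative offset, so pointer arithmetic walks to the governing token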
head = word+word.head
while head.dep == self._conjunct and head.head < 0:
head += head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in self._np_deps:
return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
raise StopIteration
# This iterator extracts spans headed by NOUNs, starting from the left-most
# syntactic dependent and extending up to the NOUN itself.
# For close-apposition and measurement constructions the span is sometimes
# extended to the right of the NOUN.
# Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee", not
# just "eine Tasse"; the same holds for "das Thema Familie".
cdef class GermanNounChunks(DocIterator):
def __init__(self, Doc doc):
super(GermanNounChunks,self).__init__(doc)
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
self._np_label = self._doc.vocab.strings['NP']
self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
self._close_app = self._doc.vocab.strings['nk']
self.i = 0
def __next__(self):
cdef const TokenC* word
cdef int rbracket
cdef Token rdep
cdef widx
while self.i < self._doc.length:
widx = self.i
self.i += 1
word = &self._doc.c[widx]
if word.pos == NOUN and word.dep in self._np_deps:
rbracket = widx+1
# try to extend the span to the right
# to capture close apposition/measurement constructions
for rdep in self._doc[widx].rights:
if rdep.pos == NOUN and rdep.dep == self._close_app:
rbracket = rdep.i+1
return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
raise StopIteration
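A usage sketch tying the two iterators together (assuming the German model data is installed and spacy.de exposes the German pipeline entry point; output is illustrative):

from spacy.de import German

nlp = German()
doc = nlp(u'Ich habe eine Tasse Tee getrunken.')
# noun_chunks now dispatches to GermanNounChunks via the parser (see below),
# so the measurement construction comes back as one span:
print([np.text for np in doc.noun_chunks])   # e.g. [u'eine Tasse Tee']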

View File

@@ -47,6 +47,8 @@ from ._parse_features cimport fill_context
from .stateclass cimport StateClass
from ._state cimport StateC
from spacy.syntax.iterators cimport DocIterator, EnglishNounChunks, GermanNounChunks
CHUNKERS = {'en':EnglishNounChunks, 'de':GermanNounChunks}
DEBUG = False
@@ -113,12 +115,9 @@ cdef class Parser:
cdef int nr_feat = self.model.nr_feat
with nogil:
self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
tokens.is_parsed = True
# Check for KeyboardInterrupt etc. Untested
PyErr_CheckSignals()
# projectivize output
if self._projectivize:
PseudoProjectivity.deprojectivize(tokens)
self._finalize(tokens)
def pipe(self, stream, int batch_size=1000, int n_threads=2):
cdef Pool mem = Pool()
@@ -144,7 +143,7 @@ cdef class Parser:
raise ValueError("Error parsing doc: %s" % sent_str)
PyErr_CheckSignals()
for doc in queue:
doc.is_parsed = True
self._finalize(doc)
yield doc
queue = []
batch_size = len(queue)
@@ -155,10 +154,19 @@ cdef class Parser:
with gil:
sent_str = queue[i].text
raise ValueError("Error parsing doc: %s" % sent_str)
for doc in queue:
doc.is_parsed = True
yield doc
PyErr_CheckSignals()
for doc in queue:
self._finalize(doc)
yield doc
def _finalize(self, Doc doc):
# deprojectivize output
if self._projectivize:
PseudoProjectivity.deprojectivize(doc)
# set annotation-specific iterators
doc.noun_chunks = CHUNKERS.get(doc.vocab.lang,DocIterator)
# mark doc as parsed
doc.is_parsed = True
cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
cdef ExampleC eg
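The CHUNKERS lookup in _finalize falls back to the base DocIterator for languages without a registered chunker, so iterating noun_chunks then raises NotImplementedError instead of silently applying English rules. A stand-alone Python sketch of that dispatch (stub classes; the real logic lives in spacy/syntax/iterators.pyx):

class DocIterator(object):
    def __init__(self, doc):
        self._doc = doc
    def __iter__(self):
        return self
    def __next__(self):
        raise NotImplementedError
    next = __next__   # Python 2 compatibility

class EnglishNounChunks(DocIterator):
    pass   # stub

class GermanNounChunks(DocIterator):
    pass   # stub

CHUNKERS = {'en': EnglishNounChunks, 'de': GermanNounChunks}

def finalize(doc, lang):
    # mirrors: doc.noun_chunks = CHUNKERS.get(doc.vocab.lang, DocIterator)
    return CHUNKERS.get(lang, DocIterator)(doc)

print(type(finalize(object(), 'fr')).__name__)   # DocIterator (no French chunker)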

View File

@@ -7,6 +7,8 @@ from ..structs cimport TokenC, LexemeC
from ..typedefs cimport attr_t
from ..attrs cimport attr_id_t
from spacy.syntax.iterators cimport DocIterator
cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
@@ -42,6 +44,8 @@ cdef class Doc:
cdef int length
cdef int max_length
cdef DocIterator noun_chunks_iterator
cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1
cpdef np.ndarray to_array(self, object features)

View File

@@ -8,6 +8,7 @@ import struct
cimport numpy as np
import math
import six
import warnings
from ..lexeme cimport Lexeme
from ..lexeme cimport EMPTY_LEXEME
@@ -80,6 +81,7 @@ cdef class Doc:
self.is_parsed = False
self._py_tokens = []
self._vector = None
self.noun_chunks_iterator = DocIterator(self)
def __getitem__(self, object i):
"""Get a Token or a Span from the Doc.
@@ -230,33 +232,22 @@ cdef class Doc:
# Set start as B
self.c[start].ent_iob = 3
@property
def noun_chunks(self):
"""Yield spans for base noun phrases."""
if not self.is_parsed:
raise ValueError(
"noun_chunks requires the dependency parse, which "
"requires data to be installed. If you haven't done so, run: "
"\npython -m spacy.en.download all\n"
"to install the data")
cdef const TokenC* word
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
'attr', 'root']
np_deps = [self.vocab.strings[label] for label in labels]
conj = self.vocab.strings['conj']
np_label = self.vocab.strings['NP']
for i in range(self.length):
word = &self.c[i]
if word.pos == NOUN and word.dep in np_deps:
yield Span(self, word.l_edge, i+1, label=np_label)
elif word.pos == NOUN and word.dep == conj:
head = word+word.head
while head.dep == conj and head.head < 0:
head += head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
yield Span(self, word.l_edge, i+1, label=np_label)
property noun_chunks:
def __get__(self):
"""Yield spans for base noun phrases."""
if not self.is_parsed:
raise ValueError(
"noun_chunks requires the dependency parse, which "
"requires data to be installed. If you haven't done so, run: "
"\npython -m spacy.%s.download all\n"
"to install the data" % self.vocab.lang)
yield from self.noun_chunks_iterator
def __set__(self, DocIterator):
self.noun_chunks_iterator = DocIterator(self)
@property
def sents(self):
@@ -267,8 +258,8 @@ cdef class Doc:
raise ValueError(
"sentence boundary detection requires the dependency parse, which "
"requires data to be installed. If you haven't done so, run: "
"\npython -m spacy.en.download all\n"
"to install the data")
"\npython -m spacy.%s.download all\n"
"to install the data" % self.vocab.lang)
cdef int i
start = 0
for i in range(1, self.length):
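Note that the noun_chunks setter above takes an iterator class rather than an instance and instantiates it with the doc, so a chunker can be swapped per document. A hedged sketch, assuming the English data is installed:

from spacy.en import English
from spacy.syntax.iterators import EnglishNounChunks

nlp = English()
doc = nlp(u'The cat sat on the mat.')
doc.noun_chunks = EnglishNounChunks          # __set__ calls EnglishNounChunks(doc)
print([np.text for np in doc.noun_chunks])   # e.g. [u'The cat', u'the mat']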

View File

@@ -18,10 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
from ..parts_of_speech cimport CONJ, PUNCT
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport FLAG14 as IS_BRACKET
from ..attrs cimport FLAG15 as IS_QUOTE
from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
from ..attrs cimport IS_BRACKET
from ..attrs cimport IS_QUOTE
from ..attrs cimport IS_LEFT_PUNCT
from ..attrs cimport IS_RIGHT_PUNCT
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport IS_OOV
@@ -95,6 +95,10 @@ cdef class Token:
def __get__(self):
return self.c.lex.prob
property lang:
def __get__(self):
return self.c.lex.lang
property idx:
def __get__(self):
return self.c.idx
@@ -161,8 +165,8 @@ cdef class Token:
raise ValueError(
"Word vectors set to length 0. This may be because the "
"data is not installed. If you haven't already, run"
"\npython -m spacy.en.download all\n"
"to install the data."
"\npython -m spacy.%s.download all\n"
"to install the data." % self.vocab.lang
)
vector_view = <float[:length,]>self.c.lex.vector
return numpy.asarray(vector_view)
@@ -177,23 +181,11 @@ cdef class Token:
property n_lefts:
def __get__(self):
cdef int n = 0
cdef const TokenC* ptr = self.c - self.i
while ptr != self.c:
if ptr + ptr.head == self.c:
n += 1
ptr += 1
return n
return self.c.l_kids
property n_rights:
def __get__(self):
cdef int n = 0
cdef const TokenC* ptr = self.c + (self.array_len - self.i)
while ptr != self.c:
if ptr + ptr.head == self.c:
n += 1
ptr -= 1
return n
return self.c.r_kids
property lefts:
def __get__(self):
@@ -415,6 +407,10 @@ cdef class Token:
def __get__(self):
return self.vocab.strings[self.c.lex.suffix]
property lang_:
def __get__(self):
return self.vocab.strings[self.c.lex.lang]
property lemma_:
def __get__(self):
return self.vocab.strings[self.c.lemma]

View File

@@ -26,7 +26,7 @@ from . import symbols
from cymem.cymem cimport Address
from .serialize.packer cimport Packer
from .attrs cimport PROB
from .attrs cimport PROB, LANG
try:
import copy_reg
@@ -104,6 +104,13 @@ cdef class Vocab:
self._serializer = Packer(self, self.serializer_freqs)
return self._serializer
property lang:
def __get__(self):
langfunc = None
if self.get_lex_attr:
langfunc = self.get_lex_attr.get(LANG,None)
return langfunc('_') if langfunc else ''
def __len__(self):
"""The current number of lexemes stored."""
return self.length
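Vocab.lang piggybacks on the LANG getter installed by the Language subclass (attrs.LANG: lambda string: cls.lang in the getter table earlier in this diff). That lambda ignores its argument, so calling it with the dummy string '_' is safe. A minimal sketch of the lookup:

get_lex_attr = {'LANG': lambda string: 'de'}   # stand-in for the attrs.LANG entry
langfunc = get_lex_attr.get('LANG', None)
print(langfunc('_') if langfunc else '')       # 'de'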
@@ -245,6 +252,7 @@ cdef class Vocab:
fp.write_from(&lexeme.prob, sizeof(lexeme.prob), 1)
fp.write_from(&lexeme.sentiment, sizeof(lexeme.sentiment), 1)
fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1)
fp.close()
def load_lexemes(self, loc):
@@ -277,6 +285,7 @@ cdef class Vocab:
fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang))
lexeme.vector = EMPTY_VEC
py_str = self.strings[lexeme.orth]