mirror of https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
add new files npchunks.{pyx,pxd} to hold noun phrase chunk generators
This commit is contained in:
parent 1508528c8c
commit d9312bc9ea
setup.py (2 additions)
@@ -56,6 +56,7 @@ MOD_NAMES = [
     'spacy.tokens.doc',
     'spacy.tokens.span',
     'spacy.tokens.token',
+    'spacy.tokens.npchunks',
     'spacy.serialize.packer',
     'spacy.serialize.huffman',
     'spacy.serialize.bits',
@@ -184,3 +185,4 @@ def setup_package():
 
 if __name__ == '__main__':
     setup_package()
+
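For context, a MOD_NAMES entry like 'spacy.tokens.npchunks' only takes effect once setup.py turns it into a compiled extension. A minimal sketch of how such a list is commonly mapped to Cython extensions; this is illustrative only, not spaCy's actual build code:

```python
# Illustrative sketch: how a MOD_NAMES-style list is typically mapped to
# Cython Extension objects. Names here are assumptions, not spaCy's code.
from setuptools import Extension

MOD_NAMES = ['spacy.tokens.npchunks']

extensions = [
    # 'spacy.tokens.npchunks' -> 'spacy/tokens/npchunks.pyx'
    Extension(name, [name.replace('.', '/') + '.pyx'])
    for name in MOD_NAMES
]
```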
spacy/tokens/doc.pyx

@@ -23,6 +23,7 @@ from .token cimport Token
 from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 
+import npchunks
 
 DEF PADDING = 5
 
@@ -239,24 +240,15 @@ cdef class Doc:
                 "requires data to be installed. If you haven't done so, run: "
                 "\npython -m spacy.en.download all\n"
                 "to install the data")
-        cdef const TokenC* word
-        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
-                  'attr', 'root']
-        np_deps = [self.vocab.strings[label] for label in labels]
-        conj = self.vocab.strings['conj']
-        np_label = self.vocab.strings['NP']
-        for i in range(self.length):
-            word = &self.c[i]
-            if word.pos == NOUN and word.dep in np_deps:
-                yield Span(self, word.l_edge, i+1, label=np_label)
-            elif word.pos == NOUN and word.dep == conj:
-                head = word+word.head
-                while head.dep == conj and head.head < 0:
-                    head += head.head
-                # If the head is an NP, and we're coordinated to it, we're an NP
-                if head.dep in np_deps:
-                    yield Span(self, word.l_edge, i+1, label=np_label)
+        chunk_rules = {'en':npchunks.english, 'de':npchunks.german}
+        for sent in self.sents:
+            lang = 'en' # todo: make dependent on language of root token
+            for chunk in chunk_rules.get(lang)(sent):
+                yield chunk
 
     @property
     def sents(self):
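The hunk above replaces the inline English-only chunker with a per-language dispatch table. A minimal pure-Python sketch of that pattern, with stand-in sentence and chunk values rather than spaCy objects (none of these names are spaCy's):

```python
# Pure-Python sketch of the dispatch pattern in the new noun_chunks body.
# Sentences are plain dicts and chunks are (start, end) pairs, standing in
# for spaCy's Span objects.
def english(sent):
    for chunk in sent['chunks']:
        yield chunk

def german(sent):
    for chunk in sent['chunks']:
        yield chunk

chunk_rules = {'en': english, 'de': german}

def noun_chunks(sents, lang='en'):
    for sent in sents:
        # look up the chunker for the sentence's language, then delegate
        for chunk in chunk_rules.get(lang)(sent):
            yield chunk

print(list(noun_chunks([{'chunks': [(0, 2), (4, 6)]}])))  # [(0, 2), (4, 6)]
```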
spacy/tokens/npchunks.pxd (new file, 0 lines)

spacy/tokens/npchunks.pyx (new file, 54 lines)
@@ -0,0 +1,54 @@
+from ..structs cimport TokenC
+from .doc cimport Doc
+from .span cimport Span
+
+from ..parts_of_speech cimport NOUN, PROPN, PRON
+
+
+def english(Span sent):
+    cdef const TokenC* word
+    strings = sent.doc.vocab.strings
+    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+    np_deps = [strings[label] for label in labels]
+    conj = strings['conj']
+    np_label = strings['NP']
+    for i in range(sent.start, sent.end):
+        word = &sent.doc.c[i]
+        if word.pos == NOUN and word.dep in np_deps:
+            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
+        elif word.pos == NOUN and word.dep == conj:
+            head = word+word.head
+            while head.dep == conj and head.head < 0:
+                head += head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
+
+
+def german(Span sent):
+    # this function extracts spans headed by NOUNs starting from the left-most
+    # syntactic dependent until the NOUN itself
+    # for close apposition and measurement construction, the span is sometimes
+    # extended to the right of the NOUN
+    # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
+    # just "eine Tasse", same for "das Thema Familie"
+    cdef const TokenC* word
+    strings = sent.doc.vocab.strings
+    labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
+    close_app = strings['nk']
+    np_deps = [strings[label] for label in labels]
+    np_label = strings['NP']
+    for i in range(sent.start, sent.end):
+        word = &sent.doc.c[i]
+        if word.pos == NOUN and word.dep in np_deps:
+            rbracket = i+1
+            # try to extend the span to the right
+            # to capture close apposition/measurement constructions
+            for rdep in sent.doc[i].rights:
+                if rdep.pos == NOUN and rdep.dep == close_app:
+                    rbracket = rdep.i+1
+            yield Span(sent.doc, word.l_edge, rbracket, label=np_label)
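With these generators wired into Doc.noun_chunks, iteration from user code is unchanged. A usage sketch against the spaCy API of this era; the entry point and attribute names below are assumptions based on the surrounding diff, not confirmed by this commit:

```python
# Usage sketch, assuming the 0.x-era entry point `spacy.en.English`.
from spacy.en import English

nlp = English()
doc = nlp(u'The quick brown fox jumped over the lazy dog.')
for np in doc.noun_chunks:
    # each chunk is a Span labelled 'NP', as constructed in npchunks.pyx
    print(np.text)
```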