From 5b385e7d78fd955d97b59024645d2592bdbc0949 Mon Sep 17 00:00:00 2001
From: Francisco Aranda
Date: Fri, 2 Jun 2017 08:14:06 +0200
Subject: [PATCH 1/2] feat(spanish model): add the spanish noun chunker

---
 spacy/syntax/iterators.pyx | 59 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 2 deletions(-)

diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx
index e1c44da7f..b0d1c78ca 100644
--- a/spacy/syntax/iterators.pyx
+++ b/spacy/syntax/iterators.pyx
@@ -1,7 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ..parts_of_speech cimport NOUN, PROPN, PRON
+from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX
 
 
 def english_noun_chunks(obj):
@@ -66,4 +66,59 @@ def german_noun_chunks(obj):
         yield word.left_edge.i, rbracket, np_label
 
 
-CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
+def es_noun_chunks(obj):
+    doc = obj.doc
+    np_label = doc.vocab.strings['NP']
+
+    # Dependency labels that can extend a noun phrase to the
+    # left or right, and labels that stop the expansion.
+    left_labels = ['det', 'fixed', 'neg']
+    right_labels = ['flat', 'fixed', 'compound', 'neg']
+    stop_labels = ['punct']
+
+    np_left_deps = [doc.vocab.strings[label] for label in left_labels]
+    np_right_deps = [doc.vocab.strings[label] for label in right_labels]
+    stop_deps = [doc.vocab.strings[label] for label in stop_labels]
+
+    def next_token(token):
+        try:
+            return token.nbor()
+        except IndexError:
+            return None
+
+    def noun_bounds(root):
+        def is_verb_token(token):
+            return token.pos in [VERB, AUX]
+
+        # Take the leftmost left child attached with an
+        # NP-internal dependency as the left bound.
+        left_bound = root
+        for token in reversed(list(root.lefts)):
+            if token.dep in np_left_deps:
+                left_bound = token
+
+        # Extend to the right, but stop before the span would
+        # cross a verb or a stop dependency such as punctuation.
+        right_bound = root
+        for token in root.rights:
+            if token.dep in np_right_deps:
+                left, right = noun_bounds(token)
+                if any(is_verb_token(t) or t.dep in stop_deps
+                       for t in doc[left_bound.i : right.i]):
+                    break
+                else:
+                    right_bound = right
+        return left_bound, right_bound
+
+    # Scan left to right, emitting one chunk per nominal head
+    # and resuming after the right bound of the last chunk.
+    token = doc[0]
+    while token and token.i < len(doc):
+        if token.pos in [PROPN, NOUN, PRON]:
+            left, right = noun_bounds(token)
+            yield left.i, right.i + 1, np_label
+            token = right
+        token = next_token(token)
+
+
+CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, 'es': es_noun_chunks}
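Not part of the patch — a minimal usage sketch of the new chunker, assuming a Spanish model with a dependency parser is installed; the model shortcut and the example sentence are illustrative:

    # coding: utf-8
    from __future__ import unicode_literals

    import spacy

    # Assumes Spanish model data is available, e.g. via `python -m spacy download es`.
    nlp = spacy.load('es')
    doc = nlp(u'El presidente del gobierno visitó la nueva fábrica de coches.')

    # Doc.noun_chunks looks up the chunker registered for the document's
    # language, so Spanish documents now go through es_noun_chunks.
    for chunk in doc.noun_chunks:
        print(chunk.text)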
From 70a21801994d7c9023f050ecfa2e3ec8a5d52d04 Mon Sep 17 00:00:00 2001
From: Francisco Aranda
Date: Fri, 2 Jun 2017 08:19:57 +0200
Subject: [PATCH 2/2] fix(spanish sentence segmentation): remove tokenizer
 exceptions that break sentence segmentation

Aligned with the training corpus.
---
 spacy/es/tokenizer_exceptions.py | 33 ++-------------------------------
 1 file changed, 2 insertions(+), 31 deletions(-)

diff --git a/spacy/es/tokenizer_exceptions.py b/spacy/es/tokenizer_exceptions.py
index e60bcd104..fb274f907 100644
--- a/spacy/es/tokenizer_exceptions.py
+++ b/spacy/es/tokenizer_exceptions.py
@@ -6,44 +6,15 @@
 from ..language_data import PRON_LEMMA, DET_LEMMA
 
 TOKENIZER_EXCEPTIONS = {
-    "al": [
-        {ORTH: "a", LEMMA: "a", TAG: ADP},
-        {ORTH: "el", LEMMA: "el", TAG: DET}
-    ],
-
-    "consigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"}
-    ],
-
-    "conmigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"}
-    ],
-
-    "contigo": [
-        {ORTH: "con", LEMMA: "con"},
-        {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}
-    ],
-
-    "del": [
-        {ORTH: "de", LEMMA: "de", TAG: ADP},
-        {ORTH: "l", LEMMA: "el", TAG: DET}
-    ],
-
-    "pel": [
-        {ORTH: "pe", LEMMA: "per", TAG: ADP},
-        {ORTH: "l", LEMMA: "el", TAG: DET}
-    ],
 
     "pal": [
         {ORTH: "pa", LEMMA: "para"},
-        {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"}
+        {ORTH: "el", LEMMA: DET_LEMMA, NORM: "el"}
     ],
 
     "pala": [
         {ORTH: "pa", LEMMA: "para"},
-        {ORTH: "la", LEMMA: DET_LEMMA}
+        {ORTH: "la", LEMMA: DET_LEMMA, NORM: "la"}
     ],
 
     "aprox.": [
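Also not part of the patch — a small sketch of the surviving exceptions in action, assuming the same illustrative Spanish pipeline; the expected split follows the updated exception table:

    # coding: utf-8
    from __future__ import unicode_literals

    import spacy

    nlp = spacy.load('es')

    # "pal" remains in TOKENIZER_EXCEPTIONS and is split into "pa" + "el",
    # matching the tokenization used in the training corpus.
    doc = nlp(u'Vamos pal mercado.')
    print([t.orth_ for t in doc])  # expected: [u'Vamos', u'pa', u'el', u'mercado', u'.']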