Fix Spanish noun_chunks (resolves #2210)

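Make sure the 'NP' label is added to the StringStore (using `strings.add` rather than a plain lookup), and move the `noun_bounds` helper into a closure inside `noun_chunks` so that it can reuse the dependency-label sets (`np_left_deps`, `np_right_deps`, `stop_deps`) and the `doc` that are local to `noun_chunks`.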
2026-03-05 12:21:27 +03:00 · 2018-04-18 18:44:01 -04:00 · 2018-04-18 18:44:01 -04:00 · 686225eadd
commit 686225eadd
parent 9632595fb4
1 changed files with 19 additions and 19 deletions
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@ -6,13 +6,30 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX

 def noun_chunks(obj):
    doc = obj.doc
-    np_label = doc.vocab.strings['NP']
-    left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
+    np_label = doc.vocab.strings.add('NP')
+    left_labels = ['det', 'fixed', 'neg'] # ['nunmod', 'det', 'appos', 'fixed']
    right_labels = ['flat', 'fixed', 'compound', 'neg']
    stop_labels = ['punct']
    np_left_deps = [doc.vocab.strings[label] for label in left_labels]
    np_right_deps = [doc.vocab.strings[label] for label in right_labels]
    stop_deps = [doc.vocab.strings[label] for label in stop_labels]
+
+    def noun_bounds(root):
+        left_bound = root
+        for token in reversed(list(root.lefts)):
+            if token.dep in np_left_deps:
+                left_bound = token
+        right_bound = root
+        for token in root.rights:
+            if (token.dep in np_right_deps):
+                left, right = noun_bounds(token)
+                if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
+                            doc[left_bound.i: right.i])):
+                    break
+                else:
+                    right_bound = right
+        return left_bound, right_bound
+
    token = doc[0]
    while token and token.i < len(doc):
        if token.pos in [PROPN, NOUN, PRON]:
@ -33,23 +50,6 @@ def next_token(token):
        return None


-def noun_bounds(root):
-    left_bound = root
-    for token in reversed(list(root.lefts)):
-        if token.dep in np_left_deps:
-            left_bound = token
-    right_bound = root
-    for token in root.rights:
-        if (token.dep in np_right_deps):
-            left, right = noun_bounds(token)
-            if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
-                           doc[left_bound.i: right.i])):
-                break
-            else:
-                right_bound = right
-    return left_bound, right_bound
-
-
 SYNTAX_ITERATORS = {
    'noun_chunks': noun_chunks
 }