mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-13 17:52:31 +03:00
This PR adds official support for Haitian Creole (ht) to spaCy's spacy/lang module. It includes: all core language data files for spacy/lang/ht (tokenizer_exceptions.py, punctuation.py, lex_attrs.py, syntax_iterators.py, lemmatizer.py, stop_words.py, tag_map.py); unit tests for the tokenizer and noun chunking (test_tokenizer.py, test_noun_chunking.py, etc.), with all 58 tests that I created under spacy/tests/lang/ht passing via pytest; basic tokenizer rules adapted for Haitian Creole orthography and informal contractions; a custom like_num attribute supporting Haitian number formats (e.g., "3yèm"); and support for common informal apostrophe usage (e.g., "m'ap", "n'ap", "di'm"). No other language modules were broken, and the code follows spaCy's coding style (PEP8, Black). This provides a foundation for Haitian Creole NLP development using spaCy.
75 lines
2.3 KiB
Python
75 lines
2.3 KiB
Python
from typing import Iterator, Tuple, Union
|
|
|
|
from ...errors import Errors
|
|
from ...symbols import NOUN, PRON, PROPN
|
|
from ...tokens import Doc, Span
|
|
|
|
|
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Yield base noun phrases for Haitian Creole from a dependency parse.

    Accepts either a Doc or a Span. Each yielded item is a
    (start, end, label) triple: start and end are token indices read from
    Token.i (end is one past the last token of the chunk) and label is the
    string-store ID of "NP".

    Raises ValueError(Errors.E029) when the underlying document has no
    "DEP" annotation. Because this is a generator, the error surfaces on
    the first iteration rather than at call time.
    """
    parsed_doc = doclike.doc
    if not parsed_doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    to_id = parsed_doc.vocab.strings.add

    # Dependency relations whose nominal token may anchor a chunk.
    head_deps = {
        to_id(dep) for dep in ("nsubj", "obj", "obl", "nmod", "appos", "ROOT")
    }
    # Right-hand dependents that get absorbed into the chunk.
    trailing_deps = {
        to_id(dep) for dep in ("compound", "flat", "flat:name", "fixed")
    }
    conj_dep = to_id("conj")
    chunk_label = to_id("NP")
    adposition = to_id("ADP")
    coordinator = to_id("CCONJ")
    nominal_pos = (NOUN, PROPN, PRON)

    # Index of the last token covered by the most recently yielded chunk.
    last_covered = -1
    for token in doclike:
        if token.pos not in nominal_pos:
            continue
        left = token.left_edge
        # Skip tokens whose subtree starts inside the previous chunk, so
        # that yielded chunks never overlap.
        if left.i <= last_covered:
            continue
        start = left.i

        if token.dep in head_deps:
            # Stretch rightwards over listed modifiers and noun dependents;
            # the last matching right child determines the chunk end.
            last = token
            for dependent in token.rights:
                if dependent.dep in trailing_deps or dependent.pos == NOUN:
                    last = dependent.right_edge
            # A leading preposition is left out of the chunk.
            if left.pos == adposition:
                start += 1
            last_covered = last.i
            yield start, last.i + 1, chunk_label
            continue

        if token.dep != conj_dep:
            continue

        # Follow the chain of "conj" heads for as long as each head's own
        # head precedes it, then check what the chain is attached to.
        anchor = token.head
        while anchor.dep == conj_dep and anchor.head.i < anchor.i:
            anchor = anchor.head
        if anchor.dep in head_deps:
            # A leading coordinating conjunction is left out of the chunk.
            if left.pos == coordinator:
                start += 1
            last_covered = token.i
            yield start, token.i + 1, chunk_label
|
|
|
|
|
|
# Syntax iterators provided by this module, keyed by iterator name.
SYNTAX_ITERATORS = {
    "noun_chunks": noun_chunks,
}
|