Mirror of https://github.com/explosion/spaCy.git
Merge c015dd1fa6 into 41e07772dc
This commit is contained in: commit b84c131df0

@@ -91,6 +91,9 @@ IDS = {
    "MORPH": MORPH,
    "IDX": IDX,
}

# Make these ints in Python, so that we don't get this unexpected 'flag' type
# This will match the behaviour before Cython 3
IDS = {name: int(value) for name, value in IDS.items()}


# ATTR IDs, in order of the symbol

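Reviewer note: the two comments above are the key bit. After the dict comprehension every value in IDS is a plain Python int rather than a Cython enum/'flag' object, so lookups and comparisons behave as they did before Cython 3. A minimal check, assuming this hunk is the IDS table exposed as spacy.attrs, with the "IDX" entry added by this commit:

# Hypothetical sanity check; module path and the "IDX" entry are assumptions from this diff.
from spacy.attrs import IDS

assert all(type(value) is int for value in IDS.values())  # no Cython 'flag' type leaks out
print(IDS["MORPH"], IDS["IDX"])  # plain integer attribute IDs, usable as dict keys
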
@@ -5,11 +5,11 @@ from thinc.api import Model
from ...language import BaseDefaults, Language
from .lemmatizer import HaitianCreoleLemmatizer
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


class HaitianCreoleDefaults(BaseDefaults):

@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
    stop_words = STOP_WORDS
    tag_map = TAG_MAP


class HaitianCreole(Language):
    lang = "ht"
    Defaults = HaitianCreoleDefaults


@HaitianCreole.factory(
    "lemmatizer",
    assigns=["token.lemma"],

@@ -49,4 +51,5 @@ def make_lemmatizer(
        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
    )


__all__ = ["HaitianCreole"]

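Reviewer note: a minimal usage sketch of the registrations above. spacy.blank("ht") and the "lemmatizer" factory name come from this diff; the sample sentence and printed output are illustrative only.

# Hypothetical usage sketch; assumes "ht" is registered via the class above in this branch.
import spacy

nlp = spacy.blank("ht")  # constructs HaitianCreole with HaitianCreoleDefaults
lemmatizer = nlp.add_pipe("lemmatizer")  # resolves to the ht-specific factory declared above
doc = nlp("M'ap pale ak li.")
print(nlp.pipe_names, [t.text for t in doc])
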
@@ -1,8 +1,8 @@
from typing import List, Tuple

from ...lookups import Lookups
from ...pipeline import Lemmatizer
from ...tokens import Token
from ...lookups import Lookups


class HaitianCreoleLemmatizer(Lemmatizer):

@@ -49,6 +49,7 @@ NORM_MAP = {
    "P": "Pa",
}


def like_num(text):
    text = text.strip().lower()
    if text.startswith(("+", "-", "±", "~")):

@@ -69,9 +70,11 @@ def like_num(text):
        return True
    return False


def norm_custom(text):
    return NORM_MAP.get(text, text.lower())


LEX_ATTRS = {
    LIKE_NUM: like_num,
    NORM: norm_custom,

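Reviewer note: the getters above are plain functions, so they can be checked directly; "P" -> "Pa" comes from the NORM_MAP hunk and "'m" -> "mwen" from the test added later in this commit. The module path spacy.lang.ht.lex_attrs is an assumption.

# Direct checks of the getters defined above; module path is an assumption.
from spacy.lang.ht.lex_attrs import like_num, norm_custom

assert norm_custom("P") == "Pa"      # entry shown in the NORM_MAP hunk
assert norm_custom("'m") == "mwen"   # matches the test added in this commit
assert like_num("25") is True        # plain digits
assert like_num("pale") is False     # ordinary word, not a number
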
@@ -4,10 +4,10 @@ from ..char_classes import (
    ALPHA_UPPER,
    CONCAT_QUOTES,
    HYPHENS,
    LIST_PUNCT,
    LIST_QUOTES,
    LIST_ELLIPSES,
    LIST_ICONS,
    LIST_PUNCT,
    LIST_QUOTES,
    merge_chars,
)

@@ -16,28 +16,43 @@ ELISION = "'’".replace(" ", "")
_prefixes_elision = "m n l y t k w"
_prefixes_elision += " " + _prefixes_elision.upper()

TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
    r"(?:({pe})[{el}])(?=[{a}])".format(
        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
    )
]
TOKENIZER_PREFIXES = (
    LIST_PUNCT
    + LIST_QUOTES
    + [
        r"(?:({pe})[{el}])(?=[{a}])".format(
            a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
        )
    ]
)

TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
    r"(?<=[0-9])%",  # numbers like 10%
    r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
    r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
    r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
    r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
    r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
]
TOKENIZER_SUFFIXES = (
    LIST_PUNCT
    + LIST_QUOTES
    + LIST_ELLIPSES
    + [
        r"(?<=[0-9])%",  # numbers like 10%
        r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
        r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
        r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
        r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
        r"(?<=[{a}])\.(?=\s|$)".format(
            a=ALPHA
        ),  # period after letter if space or end of string
        r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
    ]
)

TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
]
TOKENIZER_INFIXES = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
    ]
)

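Reviewer note: these lists are consumed by spaCy's tokenizer after being compiled into search functions; a short sketch of that wiring with the spacy.util helpers. The module path spacy.lang.ht.punctuation is an assumption.

# Sketch of how the rule lists above are consumed; module path is an assumption.
from spacy.lang.ht.punctuation import (
    TOKENIZER_INFIXES,
    TOKENIZER_PREFIXES,
    TOKENIZER_SUFFIXES,
)
from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex

prefix_search = compile_prefix_regex(TOKENIZER_PREFIXES).search
suffix_search = compile_suffix_regex(TOKENIZER_SUFFIXES).search
infix_finditer = compile_infix_regex(TOKENIZER_INFIXES).finditer

assert suffix_search("10%") is not None   # the numeric-percent suffix rule
assert prefix_search("m'ap") is not None  # elision prefix m' before a letter
print([m.group() for m in infix_finditer("pale-ak")])  # hyphen between letters is an infix
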
@@ -39,8 +39,7 @@ sa san si swa si

men mèsi oswa osinon

"""
.split()
""".split()
)

# Add common contractions, with and without apostrophe variants

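Reviewer note: STOP_WORDS is a plain set built from the word block above, so a membership check is the quickest way to verify an edit here. The module path spacy.lang.ht.stop_words is an assumption.

# Quick membership check; module path is an assumption.
from spacy.lang.ht.stop_words import STOP_WORDS

assert "men" in STOP_WORDS and "oswa" in STOP_WORDS  # both appear in the block above
print(len(STOP_WORDS))
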
@@ -1,4 +1,22 @@
from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
from spacy.symbols import (
    ADJ,
    ADP,
    ADV,
    AUX,
    CCONJ,
    DET,
    INTJ,
    NOUN,
    NUM,
    PART,
    PRON,
    PROPN,
    PUNCT,
    SCONJ,
    SYM,
    VERB,
    X,
)

TAG_MAP = {
    "NOUN": {"pos": NOUN},

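Reviewer note: TAG_MAP maps the fine-grained tag strings to coarse UPOS symbols, so a lookup is a nested dict access. The module path spacy.lang.ht.tag_map is an assumption.

# Hypothetical lookup; module path is an assumption.
from spacy.symbols import NOUN
from spacy.lang.ht.tag_map import TAG_MAP

assert TAG_MAP["NOUN"]["pos"] == NOUN  # entry shown in the hunk above
print(sorted(TAG_MAP))                 # all fine-grained tags defined for ht
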
@@ -1,4 +1,5 @@
from spacy.symbols import ORTH, NORM
from spacy.symbols import NORM, ORTH


def make_variants(base, first_norm, second_orth, second_norm):
    return {

@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
            {ORTH: second_orth, NORM: second_norm},
        ],
        base.capitalize(): [
            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
            {
                ORTH: base.split("'")[0].capitalize() + "'",
                NORM: first_norm.capitalize(),
            },
            {ORTH: second_orth, NORM: second_norm},
        ]
        ],
    }

TOKENIZER_EXCEPTIONS = {
    "Dr.": [{ORTH: "Dr."}]
}

TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}

# Apostrophe forms
TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))

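Reviewer note: make_variants returns both the lowercase and capitalized analyses keyed by surface form; a worked example derived from the function body above. The module path spacy.lang.ht.tokenizer_exceptions is an assumption.

# Worked example of make_variants; module path is an assumption.
from spacy.symbols import NORM, ORTH
from spacy.lang.ht.tokenizer_exceptions import make_variants

variants = make_variants("m'ap", "mwen", "ap", "ap")
assert variants["M'ap"] == [
    {ORTH: "M'", NORM: "Mwen"},  # base.split("'")[0].capitalize() + "'"
    {ORTH: "ap", NORM: "ap"},
]
print(sorted(variants))  # ["M'ap", "m'ap"]
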
@@ -29,93 +32,95 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))

# Non-apostrophe contractions (with capitalized variants)
TOKENIZER_EXCEPTIONS.update({
    "map": [
        {ORTH: "m", NORM: "mwen"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Map": [
        {ORTH: "M", NORM: "Mwen"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "lem": [
        {ORTH: "le", NORM: "le"},
        {ORTH: "m", NORM: "mwen"},
    ],
    "Lem": [
        {ORTH: "Le", NORM: "Le"},
        {ORTH: "m", NORM: "mwen"},
    ],
    "lew": [
        {ORTH: "le", NORM: "le"},
        {ORTH: "w", NORM: "ou"},
    ],
    "Lew": [
        {ORTH: "Le", NORM: "Le"},
        {ORTH: "w", NORM: "ou"},
    ],
    "nap": [
        {ORTH: "n", NORM: "nou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Nap": [
        {ORTH: "N", NORM: "Nou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "lap": [
        {ORTH: "l", NORM: "li"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Lap": [
        {ORTH: "L", NORM: "Li"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "yap": [
        {ORTH: "y", NORM: "yo"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Yap": [
        {ORTH: "Y", NORM: "Yo"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "mte": [
        {ORTH: "m", NORM: "mwen"},
        {ORTH: "te", NORM: "te"},
    ],
    "Mte": [
        {ORTH: "M", NORM: "Mwen"},
        {ORTH: "te", NORM: "te"},
    ],
    "mpral": [
        {ORTH: "m", NORM: "mwen"},
        {ORTH: "pral", NORM: "pral"},
    ],
    "Mpral": [
        {ORTH: "M", NORM: "Mwen"},
        {ORTH: "pral", NORM: "pral"},
    ],
    "wap": [
        {ORTH: "w", NORM: "ou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Wap": [
        {ORTH: "W", NORM: "Ou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "kap": [
        {ORTH: "k", NORM: "ki"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Kap": [
        {ORTH: "K", NORM: "Ki"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "tap": [
        {ORTH: "t", NORM: "te"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Tap": [
        {ORTH: "T", NORM: "Te"},
        {ORTH: "ap", NORM: "ap"},
    ],
})
TOKENIZER_EXCEPTIONS.update(
    {
        "map": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Map": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "lem": [
            {ORTH: "le", NORM: "le"},
            {ORTH: "m", NORM: "mwen"},
        ],
        "Lem": [
            {ORTH: "Le", NORM: "Le"},
            {ORTH: "m", NORM: "mwen"},
        ],
        "lew": [
            {ORTH: "le", NORM: "le"},
            {ORTH: "w", NORM: "ou"},
        ],
        "Lew": [
            {ORTH: "Le", NORM: "Le"},
            {ORTH: "w", NORM: "ou"},
        ],
        "nap": [
            {ORTH: "n", NORM: "nou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Nap": [
            {ORTH: "N", NORM: "Nou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "lap": [
            {ORTH: "l", NORM: "li"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Lap": [
            {ORTH: "L", NORM: "Li"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "yap": [
            {ORTH: "y", NORM: "yo"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Yap": [
            {ORTH: "Y", NORM: "Yo"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "mte": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "te", NORM: "te"},
        ],
        "Mte": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "te", NORM: "te"},
        ],
        "mpral": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "pral", NORM: "pral"},
        ],
        "Mpral": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "pral", NORM: "pral"},
        ],
        "wap": [
            {ORTH: "w", NORM: "ou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Wap": [
            {ORTH: "W", NORM: "Ou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "kap": [
            {ORTH: "k", NORM: "ki"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Kap": [
            {ORTH: "K", NORM: "Ki"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "tap": [
            {ORTH: "t", NORM: "te"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Tap": [
            {ORTH: "T", NORM: "Te"},
            {ORTH: "ap", NORM: "ap"},
        ],
    }
)

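Reviewer note: the practical effect of these entries is that each contraction splits into pronoun + marker tokens carrying the NORM values shown. A hedged end-to-end check, assuming a blank "ht" pipeline picks this module up:

# End-to-end sketch; assumes spacy.blank("ht") loads the exceptions above.
import spacy

nlp = spacy.blank("ht")
doc = nlp("Map vini")
assert [t.text for t in doc] == ["M", "ap", "vini"]
assert doc[0].norm_ == "Mwen"  # NORM from the "Map" entry above
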
@@ -23,7 +23,9 @@ IDS = {
    "SPACE": SPACE
}


# Make these ints in Python, so that we don't get this unexpected 'flag' type
# This will match the behaviour before Cython 3
IDS = {name: int(value) for name, value in IDS.items()}
NAMES = {value: key for key, value in IDS.items()}

# As of Cython 3.1, the global Python namespace no longer has the enum

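Reviewer note: same int coercion as in the attrs hunk, now paired with a reverse table so symbol names and IDs round-trip. A minimal check, assuming this hunk is spacy.symbols:

# Hypothetical round-trip check; assumes IDS/NAMES live in spacy.symbols as shown above.
from spacy.symbols import IDS, NAMES

assert type(IDS["SPACE"]) is int        # coerced to a plain Python int
assert NAMES[IDS["SPACE"]] == "SPACE"   # NAMES inverts IDS
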
@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
def test_ht_tokenizer_full_sentence(ht_tokenizer):
    text = "Si'm ka vini, m'ap pale ak li."
    tokens = [t.text for t in ht_tokenizer(text)]
    assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
    assert tokens == [
        "Si",
        "'m",
        "ka",
        "vini",
        ",",
        "m'",
        "ap",
        "pale",
        "ak",
        "li",
        ".",
    ]

@@ -1,4 +1,5 @@
import pytest

from spacy.tokens import Doc

@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
    assert len(tokens) == 5


@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
@pytest.mark.parametrize(
    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
)
def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
    tokens = ht_tokenizer(text)
    assert len(tokens) == length

@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
    assert len(tokens) == 84



@pytest.mark.parametrize(
    "text,length",
    [

@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):


@pytest.mark.parametrize(
    "word, expected", [
    "word, expected",
    [
        ("'m", "mwen"),
        ("'n", "nou"),
        ("'l", "li"),
        ("'y", "yo"),
        ("'w", "ou"),
    ]
    ],
)
def test_ht_lex_attrs_norm_custom(word, expected):
    assert norm_custom(word) == expected