Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-13 09:42:26 +03:00)

Commit b84c131df0: Merge c015dd1fa6 into 41e07772dc
@@ -91,6 +91,9 @@ IDS = {
     "MORPH": MORPH,
     "IDX": IDX,
 }
+# Make these ints in Python, so that we don't get this unexpected 'flag' type
+# This will match the behaviour before Cython 3
+IDS = {name: int(value) for name, value in IDS.items()}
 
 
 # ATTR IDs, in order of the symbol
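For readers skimming the diff: the new comprehension forces every attribute constant back to a plain Python int. A minimal standalone sketch of the idea (the `_Attr` enum below is a hypothetical stand-in for the Cython-generated constants, not spaCy code):

    import enum

    class _Attr(enum.IntFlag):
        # stand-in for compiled attribute constants that Cython 3 exposes
        # as enum/flag members rather than plain ints
        MORPH = 1
        IDX = 2

    IDS = {"MORPH": _Attr.MORPH, "IDX": _Attr.IDX}
    IDS = {name: int(value) for name, value in IDS.items()}  # same trick as the diff
    assert all(type(value) is int for value in IDS.values())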
@@ -5,11 +5,11 @@ from thinc.api import Model
 from ...language import BaseDefaults, Language
 from .lemmatizer import HaitianCreoleLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class HaitianCreoleDefaults(BaseDefaults):
@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
 
+
 class HaitianCreole(Language):
     lang = "ht"
     Defaults = HaitianCreoleDefaults
 
+
 @HaitianCreole.factory(
     "lemmatizer",
     assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
         nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
     )
 
+
 __all__ = ["HaitianCreole"]
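These three hunks only touch import order and blank-line spacing around the `ht` language class and its lemmatizer factory; behaviour is unchanged. As a usage sketch of what the factory registration enables (hedged: it assumes a spaCy build that ships `lang/ht`, and rule-based lemmatization may additionally need lookup tables such as spacy-lookups-data):

    import spacy

    # Build a blank Haitian Creole pipeline and attach the lemmatizer that the
    # @HaitianCreole.factory("lemmatizer", ...) decorator above registers.
    nlp = spacy.blank("ht")
    lemmatizer = nlp.add_pipe("lemmatizer")
    print(nlp.pipe_names)  # ['lemmatizer']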
@@ -1,8 +1,8 @@
 from typing import List, Tuple
 
+from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 from ...tokens import Token
-from ...lookups import Lookups
 
 
 class HaitianCreoleLemmatizer(Lemmatizer):
@@ -49,6 +49,7 @@ NORM_MAP = {
     "P": "Pa",
 }
 
+
 def like_num(text):
     text = text.strip().lower()
     if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
         return True
     return False
 
+
 def norm_custom(text):
     return NORM_MAP.get(text, text.lower())
 
+
 LEX_ATTRS = {
     LIKE_NUM: like_num,
     NORM: norm_custom,
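Both helpers are plain functions, so their behaviour is easy to spot-check outside spaCy. A standalone sketch (the one-entry `NORM_MAP` here is a stub; the real map is defined earlier in `lex_attrs` and only its last entry is visible in this hunk):

    NORM_MAP = {"P": "Pa"}  # stub: only the entry visible in the diff

    def norm_custom(text):
        # known short forms get their full norm, everything else is lowercased
        return NORM_MAP.get(text, text.lower())

    assert norm_custom("P") == "Pa"
    assert norm_custom("KA") == "ka"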
@@ -4,10 +4,10 @@ from ..char_classes import (
     ALPHA_UPPER,
     CONCAT_QUOTES,
     HYPHENS,
-    LIST_PUNCT,
-    LIST_QUOTES,
     LIST_ELLIPSES,
     LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
     merge_chars,
 )
 
@@ -16,23 +16,37 @@ ELISION = "'’".replace(" ", "")
 _prefixes_elision = "m n l y t k w"
 _prefixes_elision += " " + _prefixes_elision.upper()
 
-TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
+TOKENIZER_PREFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + [
     r"(?:({pe})[{el}])(?=[{a}])".format(
         a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
     )
 ]
+)
 
-TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
+TOKENIZER_SUFFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + LIST_ELLIPSES
+    + [
     r"(?<=[0-9])%",  # numbers like 10%
     r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
     r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
     r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
     r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
-    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
+    r"(?<=[{a}])\.(?=\s|$)".format(
+        a=ALPHA
+    ),  # period after letter if space or end of string
     r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
 ]
+)
 
-TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
+TOKENIZER_INFIXES = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
     r"(?<=[0-9])[+\-\*^](?=[0-9-])",
     r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
         al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
@@ -40,4 +54,5 @@ TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
     r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
     r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
     r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
 ]
+)
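The wrapping above is purely stylistic; the rules themselves are unchanged. For orientation, a rough sketch of how such suffix patterns get compiled, using spaCy's public `compile_suffix_regex` helper (the two patterns are copied from the list above; the rest of the tokenizer wiring is omitted):

    from spacy.util import compile_suffix_regex

    suffixes = [r"(?<=[0-9])%", r"(?<=\))[\.\?!]"]  # two of the ht suffix rules
    suffix_search = compile_suffix_regex(suffixes).search

    print(suffix_search("10%"))    # matches the trailing "%" after a digit
    print(suffix_search("(li)."))  # matches the "." right after ")"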
@@ -39,8 +39,7 @@ sa san si swa si
 
 men mèsi oswa osinon
 
-"""
-.split()
+""".split()
 )
 
 # Add common contractions, with and without apostrophe variants
@@ -1,4 +1,22 @@
-from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
+from spacy.symbols import (
+    ADJ,
+    ADP,
+    ADV,
+    AUX,
+    CCONJ,
+    DET,
+    INTJ,
+    NOUN,
+    NUM,
+    PART,
+    PRON,
+    PROPN,
+    PUNCT,
+    SCONJ,
+    SYM,
+    VERB,
+    X,
+)
 
 TAG_MAP = {
     "NOUN": {"pos": NOUN},
@@ -1,4 +1,5 @@
-from spacy.symbols import ORTH, NORM
+from spacy.symbols import NORM, ORTH
 
+
 def make_variants(base, first_norm, second_orth, second_norm):
     return {
@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
             {ORTH: second_orth, NORM: second_norm},
         ],
         base.capitalize(): [
-            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
+            {
+                ORTH: base.split("'")[0].capitalize() + "'",
+                NORM: first_norm.capitalize(),
+            },
             {ORTH: second_orth, NORM: second_norm},
-        ]
+        ],
     }
 
-TOKENIZER_EXCEPTIONS = {
-    "Dr.": [{ORTH: "Dr."}]
-}
+
+TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}
 
 # Apostrophe forms
 TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
@@ -29,7 +32,8 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
 
 # Non-apostrophe contractions (with capitalized variants)
-TOKENIZER_EXCEPTIONS.update({
+TOKENIZER_EXCEPTIONS.update(
+    {
     "map": [
         {ORTH: "m", NORM: "mwen"},
         {ORTH: "ap", NORM: "ap"},
@@ -118,4 +122,5 @@ TOKENIZER_EXCEPTIONS.update({
         {ORTH: "T", NORM: "Te"},
         {ORTH: "ap", NORM: "ap"},
     ],
-})
+    }
+)
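For readers who don't want to mentally expand the helper: a sketch of the structure that `make_variants("m'ap", "mwen", "ap", "ap")` returns, written out by hand. The first token of the lowercase entry is inferred by symmetry with the capitalised branch shown in the hunk above, so treat it as an assumption:

    from spacy.symbols import NORM, ORTH

    # Hand-expanded equivalent of make_variants("m'ap", "mwen", "ap", "ap")
    expected = {
        "m'ap": [
            {ORTH: "m'", NORM: "mwen"},  # inferred, not shown verbatim in the diff
            {ORTH: "ap", NORM: "ap"},
        ],
        "M'ap": [
            {ORTH: "M'", NORM: "Mwen"},
            {ORTH: "ap", NORM: "ap"},
        ],
    }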
@@ -23,7 +23,9 @@ IDS = {
     "SPACE": SPACE
 }
 
-
+# Make these ints in Python, so that we don't get this unexpected 'flag' type
+# This will match the behaviour before Cython 3
+IDS = {name: int(value) for name, value in IDS.items()}
 NAMES = {value: key for key, value in IDS.items()}
 
 # As of Cython 3.1, the global Python namespace no longer has the enum
@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
 def test_ht_tokenizer_full_sentence(ht_tokenizer):
     text = "Si'm ka vini, m'ap pale ak li."
     tokens = [t.text for t in ht_tokenizer(text)]
-    assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
+    assert tokens == [
+        "Si",
+        "'m",
+        "ka",
+        "vini",
+        ",",
+        "m'",
+        "ap",
+        "pale",
+        "ak",
+        "li",
+        ".",
+    ]
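The expected list documents the splitting behaviour concisely: `Si'm` is split into `Si` + `'m` by the contraction suffix rule, while `m'ap` becomes `m'` + `ap` via the elision handling and the `m'ap` special case in tokenizer_exceptions. A hedged way to reproduce this interactively (assumes a spaCy build that includes `lang/ht`):

    import spacy

    nlp = spacy.blank("ht")
    print([t.text for t in nlp("Si'm ka vini, m'ap pale ak li.")])
    # per the test above:
    # ['Si', "'m", 'ka', 'vini', ',', "m'", 'ap', 'pale', 'ak', 'li', '.']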
@@ -1,4 +1,5 @@
 import pytest
+
 from spacy.tokens import Doc
 
 
@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
     assert len(tokens) == 5
 
 
-@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
+@pytest.mark.parametrize(
+    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
+)
 def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
     tokens = ht_tokenizer(text)
     assert len(tokens) == length
@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
     assert len(tokens) == 84
 
 
-
 @pytest.mark.parametrize(
     "text,length",
     [
@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):
 
 
 @pytest.mark.parametrize(
-    "word, expected", [
+    "word, expected",
+    [
         ("'m", "mwen"),
         ("'n", "nou"),
         ("'l", "li"),
         ("'y", "yo"),
         ("'w", "ou"),
-    ]
+    ],
 )
 def test_ht_lex_attrs_norm_custom(word, expected):
     assert norm_custom(word) == expected