mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-23 15:54:13 +03:00
Update LatinDefaults for lang 'la' (#12538)
* Add noun chunking to la syntax iterators
* Expand list of numeral, ordinal words
* Expand abbreviations in la tokenizer_exceptions
* Add example sents
* Update spacy/lang/la/syntax_iterators.py

  Reorganize la syntax iterators

  Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Minor updates based on review
* fix call

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
b60b027927
commit
ab4ba04c32
|
@ -2,12 +2,14 @@ from ...language import Language, BaseDefaults
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
|
||||||
|
|
||||||
class LatinDefaults(BaseDefaults):
|
class LatinDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
|
||||||
|
|
||||||
class Latin(Language):
|
class Latin(Language):
|
||||||
|
|
22
spacy/lang/la/examples.py
Normal file
22
spacy/lang/la/examples.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.la.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
# The comment above each sentence names the text it is quoted from.
sentences = [
    # > Caes. BG 1.1
    "Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.",
    # > Cic. De Amic. 1
    "Q. Mucius augur multa narrare de C. Laelio socero suo memoriter et iucunde solebat nec dubitare illum in omni sermone appellare sapientem.",
    # > V. Georg. 1.1-5
    "Quid faciat laetas segetes, quo sidere terram uertere, Maecenas, ulmisque adiungere uitis conueniat, quae cura boum, qui cultus habendo sit pecori, apibus quanta experientia parcis, hinc canere incipiam",
    # > Gen. 1:1
    "In principio creavit Deus caelum et terram.",
    # > Galileo, Sid. Nunc.
    "Quo sumpto, intelligatur lunaris globus, cuius maximus circulus CAF, centrum vero E, dimetiens CF, qui ad Terre diametrum est ut duo ad septem.",
    # > van Schurman, Opusc. arg. 1
    "Cuicunque natura indita sunt principia, seu potentiae principiorum omnium artium, ac scientiarum, ei conveniunt omnes artes ac scientiae.",
]
|
|
@ -6,17 +6,16 @@ roman_numerals_compile = re.compile(
|
||||||
r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
|
r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
|
||||||
)
|
)
|
||||||
|
|
||||||
_num_words = set(
|
_num_words = """unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem undecim duodecim tredecim quattuordecim quindecim sedecim septendecim duodeviginti undeviginti viginti triginta quadraginta quinquaginta sexaginta septuaginta octoginta nonaginta centum ducenti ducentae ducenta trecenti trecentae trecenta quadringenti quadringentae quadringenta quingenti quingentae quingenta sescenti sescentae sescenta septingenti septingentae septingenta octingenti octingentae octingenta nongenti nongentae nongenta mille
|
||||||
"""
|
|
||||||
unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem
|
|
||||||
""".split()
|
""".split()
|
||||||
)
|
|
||||||
|
|
||||||
_ordinal_words = set(
|
_num_words += [item.replace("v", "u") for item in _num_words]
|
||||||
"""
|
_num_words = set(_num_words)
|
||||||
primus prima primum secundus secunda secundum tertius tertia tertium
|
|
||||||
""".split()
|
_ordinal_words = """primus prima primum secundus secunda secundum tertius tertia tertium quartus quarta quartum quintus quinta quintum sextus sexta sextum septimus septima septimum octavus octava octavum nonus nona nonum decimus decima decimum undecimus undecima undecimum duodecimus duodecima duodecimum duodevicesimus duodevicesima duodevicesimum undevicesimus undevicesima undevicesimum vicesimus vicesima vicesimum tricesimus tricesima tricesimum quadragesimus quadragesima quadragesimum quinquagesimus quinquagesima quinquagesimum sexagesimus sexagesima sexagesimum septuagesimus septuagesima septuagesimum octogesimus octogesima octogesimum nonagesimus nonagesima nonagesimum centesimus centesima centesimum ducentesimus ducentesima ducentesimum trecentesimus trecentesima trecentesimum quadringentesimus quadringentesima quadringentesimum quingentesimus quingentesima quingentesimum sescentesimus sescentesima sescentesimum septingentesimus septingentesima septingentesimum octingentesimus octingentesima octingentesimum nongentesimus nongentesima nongentesimum millesimus millesima millesimum""".split()
|
||||||
)
|
|
||||||
|
_ordinal_words += [item.replace("v", "u") for item in _ordinal_words]
|
||||||
|
_ordinal_words = set(_ordinal_words)
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
|
85
spacy/lang/la/syntax_iterators.py
Normal file
85
spacy/lang/la/syntax_iterators.py
Normal file
|
@ -0,0 +1,85 @@
|
||||||
|
from typing import Union, Iterator, Tuple
|
||||||
|
from ...tokens import Doc, Span
|
||||||
|
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
||||||
|
from ...errors import Errors
|
||||||
|
|
||||||
|
# NB: Modified from da on suggestion from https://github.com/explosion/spaCy/issues/7457#issuecomment-800349751 [PJB]
|
||||||
|
|
||||||
|
|
||||||
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """Detect base noun phrases from a dependency parse. Works on Doc and Span.

    Every NOUN, PROPN or PRON token of `doclike` is a candidate head. Its
    chunk is widened to the left and to the right over children that carry
    one of the whitelisted dependency labels. A candidate whose left edge
    does not lie beyond the previously yielded chunk is skipped.

    doclike (Union[Doc, Span]): The object to extract noun chunks from.
    YIELDS (Tuple[int, int, int]): Index of the first token of the chunk,
        index of its last token plus one, and the value returned by
        `doc.vocab.strings.add("NP")`.
    RAISES (ValueError): If the underlying Doc has no "DEP" annotation.
    """
    doc = doclike.doc  # Ensure works on both Doc and Span.

    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    if not len(doc):
        return

    strings = doc.vocab.strings
    np_label = strings.add("NP")
    # Dependency labels that may extend a chunk leftwards of its head.
    left_deps = {
        strings.add(label)
        for label in (
            "det",
            "fixed",
            "nmod:poss",
            "amod",
            "flat",
            "goeswith",
            "nummod",
            "appos",
        )
    }
    # Dependency labels that may extend a chunk rightwards of its head.
    right_deps = {
        strings.add(label)
        for label in (
            "fixed",
            "nmod:poss",
            "amod",
            "flat",
            "goeswith",
            "nummod",
            "appos",
            "nmod",
            "det",
        )
    }
    # Dependency labels that may never occur inside a chunk.
    stop_deps = {strings.add(label) for label in ("punct",)}
    verb_pos = (VERB, AUX)
    head_pos = (PROPN, NOUN, PRON)

    def blocks_chunk(tok):
        # Verbs, auxiliaries and stop-labelled tokens end a rightward extension.
        return tok.pos in verb_pos or tok.dep in stop_deps

    def left_edge(head):
        # The first whitelisted left child in document order, else the head.
        return next((kid for kid in head.lefts if kid.dep in left_deps), head)

    def right_edge(head):
        edge = head
        for kid in head.rights:
            if kid.dep not in right_deps:
                continue
            candidate = right_edge(kid)
            # Give up as soon as a blocking token lies between the head and
            # the candidate edge; otherwise accept the wider span.
            if any(blocks_chunk(tok) for tok in doc[head.i : candidate.i]):
                break
            edge = candidate
        return edge

    prev_right = -1
    for token in doclike:
        if token.pos not in head_pos:
            continue
        left, right = left_edge(token), right_edge(token)
        # Skip heads whose chunk would start inside the previous chunk.
        if left.i <= prev_right:
            continue
        yield left.i, right.i + 1, np_label
        prev_right = right.i
|
|
||||||
|
|
||||||
|
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
|
@ -12,65 +12,15 @@ _exc = {
|
||||||
"uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
|
"uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
|
||||||
}
|
}
|
||||||
|
|
||||||
for orth in [
|
_abbrev_exc = """A. A.D. Aa. Aaa. Acc. Agr. Ap. Apr. April. A.U.C. Aug. C. Caes. Caess. Cc. Cn. Coll. Cons. Conss. Cos. Coss. D. D.N. Dat. Dd. Dec. Decemb. Decembr. F. Feb. Febr. Februar. Ian. Id. Imp. Impp. Imppp. Iul. Iun. K. Kal. L. M'. M. Mai. Mam. Mar. Mart. Med. N. Nn. Nob. Non. Nov. Novemb. Oct. Octob. Opet. Ord. P. Paul. Pf. Pl. Plur. Post. Pp. Prid. Pro. Procos. Q. Quint. S. S.C. Scr. Sept. Septemb. Ser. Sert. Sex. Sext. St. Sta. Suff. T. Ti. Trib. V. Vol. Vop. Vv.""".split()
|
||||||
"A.",
|
|
||||||
"Agr.",
|
_abbrev_exc += [item.lower() for item in _abbrev_exc]
|
||||||
"Ap.",
|
_abbrev_exc += [item.upper() for item in _abbrev_exc]
|
||||||
"C.",
|
_abbrev_exc += [item.replace("v", "u").replace("V", "U") for item in _abbrev_exc]
|
||||||
"Cn.",
|
|
||||||
"D.",
|
_abbrev_exc += ["d.N."]
|
||||||
"F.",
|
|
||||||
"K.",
|
for orth in set(_abbrev_exc):
|
||||||
"L.",
|
|
||||||
"M'.",
|
|
||||||
"M.",
|
|
||||||
"Mam.",
|
|
||||||
"N.",
|
|
||||||
"Oct.",
|
|
||||||
"Opet.",
|
|
||||||
"P.",
|
|
||||||
"Paul.",
|
|
||||||
"Post.",
|
|
||||||
"Pro.",
|
|
||||||
"Q.",
|
|
||||||
"S.",
|
|
||||||
"Ser.",
|
|
||||||
"Sert.",
|
|
||||||
"Sex.",
|
|
||||||
"St.",
|
|
||||||
"Sta.",
|
|
||||||
"T.",
|
|
||||||
"Ti.",
|
|
||||||
"V.",
|
|
||||||
"Vol.",
|
|
||||||
"Vop.",
|
|
||||||
"U.",
|
|
||||||
"Uol.",
|
|
||||||
"Uop.",
|
|
||||||
"Ian.",
|
|
||||||
"Febr.",
|
|
||||||
"Mart.",
|
|
||||||
"Apr.",
|
|
||||||
"Mai.",
|
|
||||||
"Iun.",
|
|
||||||
"Iul.",
|
|
||||||
"Aug.",
|
|
||||||
"Sept.",
|
|
||||||
"Oct.",
|
|
||||||
"Nov.",
|
|
||||||
"Nou.",
|
|
||||||
"Dec.",
|
|
||||||
"Non.",
|
|
||||||
"Id.",
|
|
||||||
"A.D.",
|
|
||||||
"Coll.",
|
|
||||||
"Cos.",
|
|
||||||
"Ord.",
|
|
||||||
"Pl.",
|
|
||||||
"S.C.",
|
|
||||||
"Suff.",
|
|
||||||
"Trib.",
|
|
||||||
]:
|
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||||
|
|
52
spacy/tests/lang/la/test_noun_chunks.py
Normal file
52
spacy/tests/lang/la/test_noun_chunks.py
Normal file
|
@ -0,0 +1,52 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_noun_chunks_is_parsed(la_tokenizer):
    """Test that noun_chunks raises a ValueError for 'la' language if Doc is not parsed.

    The Doc is taken directly from the `la_tokenizer` fixture and is given no
    heads or dependency labels here, so iterating over its noun chunks is
    expected to fail.
    """
    doc = la_tokenizer("Haec est sententia.")
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
|
||||||
|
|
||||||
|
|
||||||
|
# Each case is (text, pos, deps, heads, expected_noun_chunks), in the order
# named by the parametrize call on test_la_noun_chunks. The entries of `heads`
# are offsets: the test adds each token's own index to them before building
# the Doc.
LA_NP_TEST_EXAMPLES = [
    (
        "Haec narrantur a poetis de Perseo.",
        ["DET", "VERB", "ADP", "NOUN", "ADP", "PROPN", "PUNCT"],
        ["nsubj:pass", "ROOT", "case", "obl", "case", "obl", "punct"],
        [1, 0, -1, -1, -3, -1, -5],
        ["poetis", "Perseo"],
    ),
    (
        "Perseus autem in sinu matris dormiebat.",
        ["NOUN", "ADV", "ADP", "NOUN", "NOUN", "VERB", "PUNCT"],
        ["nsubj", "discourse", "case", "obl", "nmod", "ROOT", "punct"],
        [5, 4, 3, -1, -1, 0, -1],
        ["Perseus", "sinu matris"],
    ),
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "text,pos,deps,heads,expected_noun_chunks", LA_NP_TEST_EXAMPLES
)
def test_la_noun_chunks(la_tokenizer, text, pos, deps, heads, expected_noun_chunks):
    """Check the noun chunk texts produced for a hand-annotated sentence."""
    tokens = la_tokenizer(text)

    assert len(heads) == len(pos)
    words = [token.text for token in tokens]
    # Each entry of `heads` is added to its token's index before it is
    # handed to Doc.
    shifted_heads = [offset + index for index, offset in enumerate(heads)]
    doc = Doc(tokens.vocab, words=words, heads=shifted_heads, deps=deps, pos=pos)

    noun_chunks = list(doc.noun_chunks)
    assert len(noun_chunks) == len(expected_noun_chunks)
    for chunk, expected_text in zip(noun_chunks, expected_noun_chunks):
        assert chunk.text == expected_text
|
Loading…
Reference in New Issue
Block a user