Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2018-12-17 23:08:40 +00:00
commit 3c4a2edf4a
6 changed files with 162 additions and 18083 deletions

View File

@@ -56,7 +56,10 @@ def main(model=None, output_dir=None, n_iter=100):
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
     with nlp.disable_pipes(*other_pipes):  # only train NER
-        optimizer = nlp.begin_training()
+        # reset and initialize the weights randomly but only if we're
+        # training a new model
+        if model is None:
+            optimizer = nlp.begin_training()
         for itn in range(n_iter):
             random.shuffle(TRAIN_DATA)
             losses = {}
@@ -68,7 +71,6 @@ def main(model=None, output_dir=None, n_iter=100):
                     texts,  # batch of texts
                     annotations,  # batch of annotations
                     drop=0.5,  # dropout - make it harder to memorise data
-                    sgd=optimizer,  # callable to update weights
                     losses=losses,
                 )
             print("Losses", losses)

View File

@@ -323,7 +323,7 @@ for exc_data in [
 # Other contractions with leading apostrophe
 for exc_data in [
-    {ORTH: "cause", LEMMA: "because", NORM: "because"},
+    {ORTH: "cause", NORM: "because"},
     {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
     {ORTH: "ll", LEMMA: "will", NORM: "will"},
     {ORTH: "nuff", LEMMA: "enough", NORM: "enough"},

File diff suppressed because it is too large

View File

@@ -53,7 +53,7 @@ for exc_data in [
     _exc[exc_data[ORTH]] = [exc_data]

-for orth in FR_BASE_EXCEPTIONS + ["etc."]:
+for orth in ["etc."]:
     _exc[orth] = [{ORTH: orth}]
@@ -93,14 +93,18 @@ for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
 _infixes_exc = []
-for elision_char in ELISION:
-    for hyphen_char in ["-", ""]:
-        _infixes_exc += [
-            infix.replace("'", elision_char).replace("-", hyphen_char)
-            for infix in FR_BASE_EXCEPTIONS
-        ]
-_infixes_exc += [upper_first_letter(word) for word in _infixes_exc]
-_infixes_exc = list(set(_infixes_exc))
+orig_elision = "'"
+orig_hyphen = '-'
+
+# loop through the elison and hyphen characters, and try to substitute the ones that weren't used in the original list
+for infix in FR_BASE_EXCEPTIONS:
+    variants_infix = {infix}
+    for elision_char in [x for x in ELISION if x != orig_elision]:
+        variants_infix.update([word.replace(orig_elision, elision_char) for word in variants_infix])
+    for hyphen_char in [x for x in ['-', ''] if x != orig_hyphen]:
+        variants_infix.update([word.replace(orig_hyphen, hyphen_char) for word in variants_infix])
+    variants_infix.update([upper_first_letter(word) for word in variants_infix])
+    _infixes_exc.extend(variants_infix)

 for orth in _infixes_exc:
     _exc[orth] = [{ORTH: orth}]
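
Reviewer note: to make the variant-generation logic above easier to follow in isolation, here is a stand-alone toy version of the same idea. The BASE list and the assumed contents of ELISION_CHARS are made up for illustration; the real module iterates FR_BASE_EXCEPTIONS and its own ELISION constant. Each base exception is expanded into every spelling that swaps in an alternative apostrophe, drops the hyphen, or capitalises the first letter, deduplicated through a set.

def upper_first_letter(text):
    return text[0].upper() + text[1:] if text else text

BASE = ["grand'chose", "arc-en-ciel"]   # toy stand-in for FR_BASE_EXCEPTIONS
ELISION_CHARS = ["'", "\u2019"]         # straight + typographic apostrophe (assumed)
HYPHEN_CHARS = ["-", ""]                # keep the hyphen or drop it

variants = []
for base in BASE:
    forms = {base}
    for apo in [c for c in ELISION_CHARS if c != "'"]:
        forms.update([w.replace("'", apo) for w in forms])
    for hyp in [c for c in HYPHEN_CHARS if c != "-"]:
        forms.update([w.replace("-", hyp) for w in forms])
    forms.update([upper_first_letter(w) for w in forms])
    variants.extend(forms)

print(sorted(variants))
# e.g. ["Arc-en-ciel", "Arcenciel", "Grand'chose", "Grand’chose",
#       "arc-en-ciel", "arcenciel", "grand'chose", "grand’chose"]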
@@ -111,6 +115,7 @@ _hyphen_prefix = [
     "abat",
     "a[fg]ro",
     "after",
+    "aigues?",
     "am[ée]ricano",
     "anglo",
     "anti",
@@ -120,24 +125,33 @@ _hyphen_prefix = [
     "archi",
     "arrières?",
     "avant",
+    "avion",
     "auto",
     "banc",
     "bas(?:ses?)?",
+    "bateaux?",
     "bec?",
+    "belles?",
+    "beau",
     "best",
     "bio?",
     "bien",
     "blanc",
     "bo[îi]te",
+    "bonn?e?s?",
     "bois",
     "bou(?:c|rg)",
     "b[êe]ta",
     "cache",
     "cap(?:ello)?",
+    "casse",
+    "castel",
     "champ",
     "chapelle",
-    "ch[âa]teau",
+    "ch[âa]teau(?:neuf)?",
+    "chasse",
     "cha(?:ud|t)e?s?",
+    "chauffe",
     "chou",
     "chromo",
     "claire?s?",
@@ -146,7 +160,8 @@ _hyphen_prefix = [
     "contre",
     "cordon",
     "coupe?",
-    "court",
+    "courte?s?",
+    "couvre",
     "crash",
     "crise",
     "croche",
@@ -155,15 +170,19 @@ _hyphen_prefix = [
     "côte",
     "demi",
     "di(?:sney)?",
+    "dix",
     "d[ée]s?",
+    "double",
     "dys",
+    "ex?",
+    "émirato",
     "entre",
     "est",
     "ethno",
-    "ex",
     "extra",
     "extrême",
     "[ée]co",
+    "faux",
     "fil",
     "fort",
     "franco?s?",
@@ -176,6 +195,8 @@ _hyphen_prefix = [
     "gros",
     "g[ée]o",
     "haute?s?",
+    "homm?es?",
+    "hors",
     "hyper",
     "indo",
     "infra",
@@ -190,6 +211,8 @@ _hyphen_prefix = [
     "lot",
     "louis",
     "m[ai]cro",
+    "mal",
+    "médio",
     "mesnil",
     "mi(?:ni)?",
     "mono",
@@ -201,6 +224,7 @@ _hyphen_prefix = [
     "m[ée]do",
     "m[ée]ta",
     "mots?",
+    "neuro",
     "noix",
     "non",
     "nord",
@@ -213,60 +237,137 @@ _hyphen_prefix = [
     "perce",
     "pharmaco",
     "ph[oy]to",
+    "pieds?",
     "pique",
     "poissons?",
     "ponce",
     "pont",
     "po[rs]t",
+    "pousse",
     "primo",
     "pro(?:cès|to)?",
     "pare",
-    "petite?",
+    "petite?s?",
+    "plessis",
     "porte",
     "pré",
     "prêchi",
+    "protège",
     "pseudo",
     "pêle",
     "péri",
     "puy",
     "quasi",
+    "quatre",
+    "radio",
     "recourt",
     "rythmo",
+    "(?:re)?doubles?",
     "r[ée]",
     "r[ée]tro",
-    "sans",
-    "sainte?s?",
+    "requin",
+    "sans?",
+    "sa?inte?s?",
     "semi",
-    "social",
+    "serre",
+    "sino",
+    "socio",
+    "sociale?s?",
+    "soixante",
     "sous",
-    "su[bdr]",
+    "su[bdrs]",
     "super",
+    "taille",
     "tire",
     "thermo",
     "tiers",
+    "tourne",
+    "toute?s?",
+    "tra[iî]ne?",
     "trans",
+    "trente",
+    "trois",
+    "trousse",
     "tr(?:i|ou)",
     "t[ée]l[ée]",
+    "utéro",
+    "vaso",
     "vi[cd]e",
     "vid[ée]o",
-    "vie(?:ux|illes?)",
+    "vie(?:ux|i?lles?|i?l)",
     "vill(?:e|eneuve|ers|ette|iers|y)",
+    "vingt",
+    "voitures?",
+    "wagons?",
     "ultra",
     "à",
     "[ée]lectro",
     "[ée]qui",
+    "Fontaine",
+    "La Chapelle",
+    "Marie",
+    "Le Mesnil",
+    "Neuville",
+    "Pierre",
+    "Val",
+    "Vaux",
 ]
-_elision_prefix = ["entr", "grande?s?"]
 _other_hyphens = "".join([h for h in HYPHENS if h != "-"])
 _regular_exp = [
-    "^droits?[{hyphen}]de[{hyphen}]l'homm[{alpha}]+$".format(
-        hyphen=HYPHENS, alpha=ALPHA_LOWER
-    ),
+    "^a[{hyphen}]sexualis[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^arginine[{hyphen}]méthyl[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^binge[{hyphen}]watch[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^black[{hyphen}]out[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^bouche[{hyphen}]por[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^burn[{hyphen}]out[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^by[{hyphen}]pass[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^ch[{elision}]tiis[{alpha}]+$".format(elision=ELISION, alpha=ALPHA_LOWER),
+    "^chape[{hyphen}]chut[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^down[{hyphen}]load[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^[ée]tats[{hyphen}]uni[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^droits?[{hyphen}]de[{hyphen}]l'homm[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^fac[{hyphen}]simil[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^fleur[{hyphen}]bleuis[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^flic[{hyphen}]flaqu[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^fox[{hyphen}]trott[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^google[{hyphen}]is[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^hard[{hyphen}]discount[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^hip[{hyphen}]hop[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^jet[{hyphen}]set[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^knock[{hyphen}]out[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^lèche[{hyphen}]bott[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^litho[{hyphen}]typographi[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^lock[{hyphen}]out[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^lombri[{hyphen}]compost[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^mac[{hyphen}]adamis[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^marque[{hyphen}]pag[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^mouton[{hyphen}]noiris[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^new[{hyphen}]york[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^pair[{hyphen}]programm[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^people[{hyphen}]is[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^plan[{hyphen}]socialis[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^premier[{hyphen}]ministr[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^prud[{elision}]hom[{alpha}]+$".format(elision=ELISION, alpha=ALPHA_LOWER),
+    "^réarc[{hyphen}]bout[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^refox[{hyphen}]trott[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^remicro[{hyphen}]ond[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^repique[{hyphen}]niqu[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^repetit[{hyphen}]déjeun[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^rick[{hyphen}]roll[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^rond[{hyphen}]ponn[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^shift[{hyphen}]cliqu[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^soudo[{hyphen}]bras[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^stabilo[{hyphen}]boss[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^strip[{hyphen}]teas[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^terra[{hyphen}]form[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^teuf[{hyphen}]teuf[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
+    "^yo[{hyphen}]yo[{alpha}]+$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
     "^zig[{hyphen}]zag[{alpha}]*$".format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
-    "^prud[{elision}]homm[{alpha}]*$".format(elision=ELISION, alpha=ALPHA_LOWER),
+    "^z[{elision}]yeut[{alpha}]+$".format(elision=ELISION, alpha=ALPHA_LOWER),
 ]
+# catching cases like faux-vampire
 _regular_exp += [
     "^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format(
         prefix=p,
@@ -277,12 +378,32 @@ _regular_exp += [
     )
     for p in _hyphen_prefix
 ]
+# catching cases like entr'abat
+_elision_prefix = ['r?é?entr', 'grande?s?', 'r']
 _regular_exp += [
     "^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".format(
-        prefix=p, elision=HYPHENS, hyphen=_other_hyphens, alpha=ALPHA_LOWER
+        prefix=p,
+        elision=ELISION,
+        hyphen=_other_hyphens,
+        alpha=ALPHA_LOWER,
     )
     for p in _elision_prefix
 ]
+# catching cases like saut-de-ski, pet-en-l'air
+_hyphen_combination = ['l[èe]s?', 'la', 'en', 'des?', 'd[eu]', 'sur', 'sous', 'aux?', 'à', 'et', "près", "saint"]
+_regular_exp += [
+    "^[{alpha}]+[{hyphen}]{hyphen_combo}[{hyphen}](?:l[{elision}])?[{alpha}]+$".format(
+        hyphen_combo=hc,
+        elision=ELISION,
+        hyphen=HYPHENS,
+        alpha=ALPHA_LOWER,
+    )
+    for hc in _hyphen_combination
+]
+# URLs
 _regular_exp.append(URL_PATTERN)
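
Reviewer note: to see how a list like _regular_exp is typically consumed, here is a rough stand-alone sketch. The two toy patterns below are simplified stand-ins rather than the module's real entries, and ALPHA_LOWER is approximated by a plain character range. The idea is that the patterns are joined into one case-insensitive regex whose match method can serve as the tokenizer's token_match hook, so any candidate string the regex matches is kept as a single token instead of being split at the hyphen or apostrophe.

import re

# toy stand-ins for the generated patterns
_toy_regular_exp = [
    r"^ch[âa]teau(?:neuf)?[-]la[-][a-zà-ÿ]+$",
    r"^[a-zà-ÿ]+[-](?:de|en)[-](?:l')?[a-zà-ÿ]+$",
]
TOKEN_MATCH = re.compile("|".join(_toy_regular_exp), re.IGNORECASE | re.UNICODE).match

for text in ["Châteauneuf-la-Forêt", "saut-de-ski", "pet-en-l'air", "pomme"]:
    print(text, bool(TOKEN_MATCH(text)))
# Châteauneuf-la-Forêt True, saut-de-ski True, pet-en-l'air True, pomme False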

View File

@@ -342,6 +342,8 @@ def _convert_strings(token_specs, string_store):
                     keys = ', '.join(operators.keys())
                     raise KeyError(Errors.E011.format(op=value, opts=keys))
             if isinstance(attr, basestring):
+                if attr.upper() == 'TEXT':
+                    attr = 'ORTH'
                 attr = IDS.get(attr.upper())
             if isinstance(value, basestring):
                 value = string_store.add(value)
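
Reviewer note: the two added lines make "TEXT" an accepted alias for "ORTH" (the verbatim token text) in Matcher patterns. A quick sketch of what that enables, using the v2-era Matcher.add signature; the pattern and the example sentence are invented for illustration.

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# "TEXT" now behaves exactly like "ORTH": match the verbatim token text
matcher.add("HELLO_WORLD", None, [{"TEXT": "Hello"}, {"TEXT": "world"}])

doc = nlp("Hello world, again")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)  # Hello world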

View File

@@ -5,7 +5,15 @@ import pytest
 @pytest.mark.parametrize(
-    "text", ["aujourd'hui", "Aujourd'hui", "prud'hommes", "prudhommal"]
+    "text", ["aujourd'hui", "Aujourd'hui", "prud'hommes", "prudhommal",
+             "audio-numérique", "Audio-numérique",
+             "entr'amis", "entr'abat", "rentr'ouvertes", "grand'hamien",
+             "Châteauneuf-la-Forêt", "Château-Guibert",
+             "11-septembre", "11-Septembre", "refox-trottâmes",
+             "K-POP", "K-Pop", "K-pop", "z'yeutes",
+             "black-outeront", "états-unienne",
+             "courtes-pattes", "court-pattes",
+             "saut-de-ski", "Écourt-Saint-Quentin", "Bout-de-l'Îlien", "pet-en-l'air"]
 )
 def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
     tokens = fr_tokenizer(text)