Tidy up and auto-format

Ines Montani 2020-03-25 12:28:12 +01:00
parent b71dd44dbc
commit 828acffc12
32 changed files with 1828 additions and 1793 deletions

View File

@@ -225,7 +225,9 @@ def train(
 exits=1,
 )
 msg.text("Extending component from base model '{}'".format(pipe))
-disabled_pipes = nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
+disabled_pipes = nlp.disable_pipes(
+[p for p in nlp.pipe_names if p not in pipeline]
+)
 else:
 msg.text("Starting with blank model '{}'".format(lang))
 lang_cls = util.get_lang_class(lang)
@@ -415,10 +417,10 @@ def train(
 losses=losses,
 )
 except ValueError as e:
-msg.warn("Error during training")
+err = "Error during training"
 if init_tok2vec:
-msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?")
-msg.fail("Original error message: {}".format(e), exits=1)
+err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
+msg.fail(err, "Original error message: {}".format(e), exits=1)
 if raw_text:
 # If raw text is available, perform 'rehearsal' updates,
 # which use unlabelled data to reduce overfitting.
@@ -546,7 +548,10 @@ def train(
 )
 break
 except Exception as e:
-msg.warn("Aborting and saving the final best model. Encountered exception: {}".format(e))
+msg.warn(
+"Aborting and saving the final best model. "
+"Encountered exception: {}".format(e)
+)
 finally:
 best_pipes = nlp.pipe_names
 if disabled_pipes:
@@ -563,13 +568,20 @@ def train(
 final_meta["speed"].setdefault("gpu", None)
 # combine cpu and gpu speeds with the base model speeds
 if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
-speed = _get_total_speed([final_meta["speed"]["cpu"], meta["speed"]["cpu"]])
+speed = _get_total_speed(
+[final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
+)
 final_meta["speed"]["cpu"] = speed
 if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
-speed = _get_total_speed([final_meta["speed"]["gpu"], meta["speed"]["gpu"]])
+speed = _get_total_speed(
+[final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
+)
 final_meta["speed"]["gpu"] = speed
 # if there were no speeds to update, overwrite with meta
-if final_meta["speed"]["cpu"] is None and final_meta["speed"]["gpu"] is None:
+if (
+final_meta["speed"]["cpu"] is None
+and final_meta["speed"]["gpu"] is None
+):
 final_meta["speed"].update(meta["speed"])
 # note: beam speeds are not combined with the base model
 if has_beam_widths:

View File

@@ -146,9 +146,14 @@ def parse_deps(orig_doc, options={}):
 retokenizer.merge(span, attrs=attrs)
 fine_grained = options.get("fine_grained")
 add_lemma = options.get("add_lemma")
-words = [{"text": w.text,
-"tag": w.tag_ if fine_grained else w.pos_,
-"lemma": w.lemma_ if add_lemma else None} for w in doc]
+words = [
+{
+"text": w.text,
+"tag": w.tag_ if fine_grained else w.pos_,
+"lemma": w.lemma_ if add_lemma else None,
+}
+for w in doc
+]
 arcs = []
 for word in doc:

View File

@@ -3,7 +3,13 @@ from __future__ import unicode_literals
 import uuid
-from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS, TPL_ENTS
+from .templates import (
+TPL_DEP_SVG,
+TPL_DEP_WORDS,
+TPL_DEP_WORDS_LEMMA,
+TPL_DEP_ARCS,
+TPL_ENTS,
+)
 from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
 from ..util import minify_html, escape_html, registry
 from ..errors import Errors
@@ -83,7 +89,10 @@ class DependencyRenderer(object):
 self.width = self.offset_x + len(words) * self.distance
 self.height = self.offset_y + 3 * self.word_spacing
 self.id = render_id
-words = [self.render_word(w["text"], w["tag"], w.get("lemma", None), i) for i, w in enumerate(words)]
+words = [
+self.render_word(w["text"], w["tag"], w.get("lemma", None), i)
+for i, w in enumerate(words)
+]
 arcs = [
 self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
 for i, a in enumerate(arcs)
@@ -101,7 +110,9 @@ class DependencyRenderer(object):
 lang=self.lang,
 )
-def render_word(self, text, tag, lemma, i,):
+def render_word(
+self, text, tag, lemma, i,
+):
 """Render individual word.
 text (unicode): Word text.
@@ -115,7 +126,9 @@ class DependencyRenderer(object):
 x = self.width - x
 html_text = escape_html(text)
 if lemma is not None:
-return TPL_DEP_WORDS_LEMMA.format(text=html_text, tag=tag, lemma=lemma, x=x, y=y)
+return TPL_DEP_WORDS_LEMMA.format(
+text=html_text, tag=tag, lemma=lemma, x=x, y=y
+)
 return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
 def render_arrow(self, label, start, end, direction, i):

View File

@@ -112,7 +112,6 @@ class Warnings(object):
 "in problems with the vocab further on in the pipeline.")
 @add_codes
 class Errors(object):
 E001 = ("No component '{name}' found in pipeline. Available names: {opts}")

View File

@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT
+from ..char_classes import CURRENCY, UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES

View File

@@ -10,5 +10,5 @@ Example sentences to test spaCy and its language models.
 sentences = [
 "bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du",
-"gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira"
+"gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira",
 ]

View File

@@ -59,7 +59,6 @@ behin
 """.split()
 def like_num(text):
 if text.startswith(("+", "-", "±", "~")):
 text = text[1:]

View File

@@ -5,7 +5,7 @@ from __future__ import unicode_literals
 # https://www.ranks.nl/stopwords/basque
 # https://www.mustgo.com/worldlanguages/basque/
 STOP_WORDS = set(
 """
 al
 anitz
 arabera

View File

@ -16,7 +16,9 @@ _hyphen_suffixes += " " + _hyphen_suffixes.upper()
_prefixes = TOKENIZER_PREFIXES + [ _prefixes = TOKENIZER_PREFIXES + [
r"(?:({pe})[{el}])(?=[{a}])".format(a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)) r"(?:({pe})[{el}])(?=[{a}])".format(
a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
)
] ]
_suffixes = ( _suffixes = (
@ -33,7 +35,9 @@ _suffixes = (
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
), ),
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
r"(?<=[{a}])[{h}]({hs})".format(a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)), r"(?<=[{a}])[{h}]({hs})".format(
a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)
),
] ]
) )

View File

@@ -6,10 +6,10 @@ import re
 from .punctuation import ELISION, HYPHENS
 from ..tokenizer_exceptions import URL_PATTERN
 from ..char_classes import ALPHA_LOWER, ALPHA
-from ...symbols import ORTH, LEMMA, TAG
+from ...symbols import ORTH, LEMMA
 # not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
-#from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
+# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
 FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]
@@ -93,7 +93,7 @@ for verb, verb_lemma in [
 for pronoun in ["elle", "il", "on"]:
 token = "{}-t-{}".format(orth, pronoun)
 _exc[token] = [
-{LEMMA: verb_lemma, ORTH: orth}, #, TAG: "VERB"},
+{LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"},
 {LEMMA: "t", ORTH: "-t"},
 {LEMMA: pronoun, ORTH: "-" + pronoun},
 ]
@@ -102,7 +102,7 @@ for verb, verb_lemma in [("est", "être")]:
 for orth in [verb, verb.title()]:
 token = "{}-ce".format(orth)
 _exc[token] = [
-{LEMMA: verb_lemma, ORTH: orth}, #, TAG: "VERB"},
+{LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"},
 {LEMMA: "ce", ORTH: "-ce"},
 ]

View File

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
-from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
 from ..char_classes import LIST_ELLIPSES, LIST_ICONS
 from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES
 from ..char_classes import ALPHA_LOWER, ALPHA_UPPER
@@ -10,14 +10,7 @@ from ..char_classes import ALPHA_LOWER, ALPHA_UPPER
 ELISION = "'"
-_prefixes = (
-[
-r"'[0-9][0-9]",
-r"[0-9]+°",
-]
-+ TOKENIZER_PREFIXES
-)
+_prefixes = [r"'[0-9][0-9]", r"[0-9]+°"] + BASE_TOKENIZER_PREFIXES
 _infixes = (
@@ -31,7 +24,7 @@ _infixes = (
 r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
 r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER),
 r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA),
-r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION)
+r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
 ]
 )

View File

@@ -10,7 +10,7 @@ _exc = {
 "l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
 "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
 "po'": [{ORTH: "po'", LEMMA: "poco"}],
-"sett..": [{ORTH: "sett."}, {ORTH: "."}]
+"sett..": [{ORTH: "sett."}, {ORTH: "."}],
 }
 for orth in [
@@ -32,7 +32,7 @@ for orth in [
 "col.",
 "Cost.",
 "d.C.",
-'de"'
+'de"',
 "distr.",
 "E'",
 "ecc.",

View File

@@ -44,7 +44,7 @@ for prep, prep_lemma in [
 ("s", "se"),
 ]:
 for prefix_orth in [prefix, prefix.capitalize()]:
-_exc[prefix_orth+prep] = [
+_exc[prefix_orth + prep] = [
 {ORTH: prefix_orth, LEMMA: prefix_lemma},
 {ORTH: prep, LEMMA: prep_lemma},
 ]

View File

@@ -29,7 +29,9 @@ class LithuanianDefaults(Language.Defaults):
 infixes = TOKENIZER_INFIXES
 suffixes = TOKENIZER_SUFFIXES
-mod_base_exceptions = {exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")}
+mod_base_exceptions = {
+exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
+}
 del mod_base_exceptions["8)"]
 tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
 stop_words = STOP_WORDS

View File

@@ -8,262 +8,262 @@ _exc = {}
 for orth in [
 "n-tosios",
 "?!",
 # "G.",
 # "J. E.",
 # "J. Em.",
 # "J.E.",
 # "J.Em.",
 # "K.",
 # "N.",
 # "V.",
 # "Vt.",
 # "a.",
 # "a.k.",
 # "a.s.",
 # "adv.",
 # "akad.",
 # "aklg.",
 # "akt.",
 # "al.",
 # "ang.",
 # "angl.",
 # "aps.",
 # "apskr.",
 # "apyg.",
 # "arbat.",
 # "asist.",
 # "asm.",
 # "asm.k.",
 # "asmv.",
 # "atk.",
 # "atsak.",
 # "atsisk.",
 # "atsisk.sąsk.",
 # "atv.",
 # "aut.",
 # "avd.",
 # "b.k.",
 # "baud.",
 # "biol.",
 # "bkl.",
 # "bot.",
 # "bt.",
 # "buv.",
 # "ch.",
 # "chem.",
 # "corp.",
 # "d.",
 # "dab.",
 # "dail.",
 # "dek.",
 # "deš.",
 # "dir.",
 # "dirig.",
 # "doc.",
 # "dol.",
 # "dr.",
 # "drp.",
 # "dvit.",
 # "dėst.",
 # "dš.",
 # "dž.",
 # "e.b.",
 # "e.bankas",
 # "e.p.",
 # "e.parašas",
 # "e.paštas",
 # "e.v.",
 # "e.valdžia",
 # "egz.",
 # "eil.",
 # "ekon.",
 # "el.",
 # "el.bankas",
 # "el.p.",
 # "el.parašas",
 # "el.paštas",
 # "el.valdžia",
 # "etc.",
 # "ež.",
 # "fak.",
 # "faks.",
 # "feat.",
 # "filol.",
 # "filos.",
 # "g.",
 # "gen.",
 # "geol.",
 # "gerb.",
 # "gim.",
 # "gr.",
 # "gv.",
 # "gyd.",
 # "gyv.",
 # "habil.",
 # "inc.",
 # "insp.",
 # "inž.",
 # "ir pan.",
 # "ir t. t.",
 # "isp.",
 # "istor.",
 # "it.",
 # "just.",
 # "k.",
 # "k. a.",
 # "k.a.",
 # "kab.",
 # "kand.",
 # "kart.",
 # "kat.",
 # "ketv.",
 # "kh.",
 # "kl.",
 # "kln.",
 # "km.",
 # "kn.",
 # "koresp.",
 # "kpt.",
 # "kr.",
 # "kt.",
 # "kub.",
 # "kun.",
 # "kv.",
 # "kyš.",
 # "l. e. p.",
 # "l.e.p.",
 # "lenk.",
 # "liet.",
 # "lot.",
 # "lt.",
 # "ltd.",
 # "ltn.",
 # "m.",
 # "m.e..",
 # "m.m.",
 # "mat.",
 # "med.",
 # "mgnt.",
 # "mgr.",
 # "min.",
 # "mjr.",
 # "ml.",
 # "mln.",
 # "mlrd.",
 # "mob.",
 # "mok.",
 # "moksl.",
 # "mokyt.",
 # "mot.",
 # "mr.",
 # "mst.",
 # "mstl.",
 # "mėn.",
 # "nkt.",
 # "no.",
 # "nr.",
 # "ntk.",
 # "nuotr.",
 # "op.",
 # "org.",
 # "orig.",
 # "p.",
 # "p.d.",
 # "p.m.e.",
 # "p.s.",
 # "pab.",
 # "pan.",
 # "past.",
 # "pav.",
 # "pavad.",
 # "per.",
 # "perd.",
 # "pirm.",
 # "pl.",
 # "plg.",
 # "plk.",
 # "pr.",
 # "pr.Kr.",
 # "pranc.",
 # "proc.",
 # "prof.",
 # "prom.",
 # "prot.",
 # "psl.",
 # "pss.",
 # "pvz.",
 # "pšt.",
 # "r.",
 # "raj.",
 # "red.",
 # "rez.",
 # "rež.",
 # "rus.",
 # "rš.",
 # "s.",
 # "sav.",
 # "saviv.",
 # "sek.",
 # "sekr.",
 # "sen.",
 # "sh.",
 # "sk.",
 # "skg.",
 # "skv.",
 # "skyr.",
 # "sp.",
 # "spec.",
 # "sr.",
 # "st.",
 # "str.",
 # "stud.",
 # "sąs.",
 # "t.",
 # "t. p.",
 # "t. y.",
 # "t.p.",
 # "t.t.",
 # "t.y.",
 # "techn.",
 # "tel.",
 # "teol.",
 # "th.",
 # "tir.",
 # "trit.",
 # "trln.",
 # "tšk.",
 # "tūks.",
 # "tūkst.",
 # "up.",
 # "upl.",
 # "v.s.",
 # "vad.",
 # "val.",
 # "valg.",
 # "ved.",
 # "vert.",
 # "vet.",
 # "vid.",
 # "virš.",
 # "vlsč.",
 # "vnt.",
 # "vok.",
 # "vs.",
 # "vtv.",
 # "vv.",
 # "vyr.",
 # "vyresn.",
 # "zool.",
 # "Įn",
 # "įl.",
 # "š.m.",
 # "šnek.",
 # "šv.",
 # "švč.",
 # "ž.ū.",
 # "žin.",
 # "žml.",
 # "žr.",
 ]:
 _exc[orth] = [{ORTH: orth}]

View File

@@ -24,7 +24,6 @@ _prefixes = (
 )
 _infixes = (
 LIST_ELLIPSES
 + _list_icons

View File

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
-from ...symbols import ORTH, NORM
+from ...symbols import ORTH
 _exc = {}

View File

@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
-from ...symbols import POS, AUX, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
+from ...symbols import POS, AUX, ADJ, CCONJ, NUM, ADV, ADP, X, VERB
+from ...symbols import NOUN, PART, INTJ, PRON
 # Source https://universaldependencies.org/tagset-conversion/sk-snk-uposf.html
 # fmt: off

View File

@@ -37,7 +37,7 @@ URL_PATTERN = (
 r"|"
 # host & domain names
 # mods: match is case-sensitive, so include [A-Z]
-"(?:"
+"(?:" # noqa
 "(?:"
 "[A-Za-z0-9\u00a1-\uffff]"
 "[A-Za-z0-9\u00a1-\uffff_-]{0,62}"

View File

@@ -612,7 +612,7 @@ class Language(object):
 link_vectors_to_models(self.vocab)
 if self.vocab.vectors.data.shape[1]:
 cfg["pretrained_vectors"] = self.vocab.vectors.name
-cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
+cfg["pretrained_dims"] = self.vocab.vectors.data.shape[1]
 if sgd is None:
 sgd = create_default_optimizer(Model.ops)
 self._optimizer = sgd
@@ -857,7 +857,14 @@ class Language(object):
 procs = [
 mp.Process(
 target=_apply_pipes,
-args=(self.make_doc, pipes, rch, sch, Underscore.get_state(), load_nlp.VECTORS),
+args=(
+self.make_doc,
+pipes,
+rch,
+sch,
+Underscore.get_state(),
+load_nlp.VECTORS,
+),
 )
 for rch, sch in zip(texts_q, bytedocs_send_ch)
 ]

View File

@@ -222,11 +222,9 @@ class EntityRuler(object):
 for label, pattern, ent_id in zip(
 phrase_pattern_labels,
 self.nlp.pipe(phrase_pattern_texts),
-phrase_pattern_ids
+phrase_pattern_ids,
 ):
-phrase_pattern = {
-"label": label, "pattern": pattern, "id": ent_id
-}
+phrase_pattern = {"label": label, "pattern": pattern, "id": ent_id}
 if ent_id:
 phrase_pattern["id"] = ent_id
 phrase_patterns.append(phrase_pattern)

View File

@@ -71,9 +71,7 @@ def test_doc_array_to_from_string_attrs(en_vocab, attrs):
 def test_doc_array_idx(en_vocab):
 """Test that Doc.to_array can retrieve token start indices"""
 words = ["An", "example", "sentence"]
-doc = Doc(en_vocab, words=words)
 offsets = Doc(en_vocab, words=words).to_array("IDX")
 assert offsets[0] == 0
 assert offsets[1] == 3
 assert offsets[2] == 11

View File

@@ -59,7 +59,7 @@ def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
 ("Sprogteknologi a/s", 2),
 ("De boede i A/B Bellevue", 5),
 # note: skipping due to weirdness in UD_Danish-DDT
-#("Rotorhastigheden er 3400 o/m.", 5),
+# ("Rotorhastigheden er 3400 o/m.", 5),
 ("Jeg købte billet t/r.", 5),
 ("Murerarbejdsmand m/k søges", 3),
 ("Netværket kører over TCP/IP", 4),

View File

@@ -10,7 +10,13 @@ def test_eu_tokenizer_handles_long_text(eu_tokenizer):
 assert len(tokens) == 5
-@pytest.mark.parametrize("text,length", [("milesker ederra joan zen hitzaldia plazer hutsa", 7), ("astelehen guztia sofan pasau biot", 5)])
+@pytest.mark.parametrize(
+"text,length",
+[
+("milesker ederra joan zen hitzaldia plazer hutsa", 7),
+("astelehen guztia sofan pasau biot", 5),
+],
+)
 def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length):
 tokens = eu_tokenizer(text)
 assert len(tokens) == length

View File

@@ -297,12 +297,7 @@ WIKI_TESTS = [
 ]
 EXTRA_TESTS = (
-DOT_TESTS
-+ QUOTE_TESTS
-+ NUMBER_TESTS
-+ HYPHEN_TESTS
-+ WIKI_TESTS
-+ TYPO_TESTS
+DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS
 )
 # normal: default tests + 10% of extra tests
@@ -311,7 +306,14 @@ TESTS.extend([x for i, x in enumerate(EXTRA_TESTS) if i % 10 == 0])
 # slow: remaining 90% of extra tests
 SLOW_TESTS = [x for i, x in enumerate(EXTRA_TESTS) if i % 10 != 0]
-TESTS.extend([pytest.param(x[0], x[1], marks=pytest.mark.slow()) if not isinstance(x[0], tuple) else x for x in SLOW_TESTS])
+TESTS.extend(
+[
+pytest.param(x[0], x[1], marks=pytest.mark.slow())
+if not isinstance(x[0], tuple)
+else x
+for x in SLOW_TESTS
+]
+)
 @pytest.mark.parametrize("text,expected_tokens", TESTS)

View File

@@ -6,7 +6,8 @@ import re
 from mock import Mock
 from spacy.matcher import Matcher, DependencyMatcher
 from spacy.tokens import Doc, Token
-from ..doc.test_underscore import clean_underscore
+from ..doc.test_underscore import clean_underscore # noqa: F401
 @pytest.fixture

View File

@@ -152,10 +152,5 @@ def test_entity_ruler_validate(nlp):
 def test_entity_ruler_properties(nlp, patterns):
 ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
-assert sorted(ruler.labels) == sorted([
-"HELLO",
-"BYE",
-"COMPLEX",
-"TECH_ORG"
-])
+assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"])
 assert sorted(ruler.ent_ids) == ["a1", "a2"]

View File

@@ -23,4 +23,3 @@ def test_issue4725():
 docs = ["Kurt is in London."] * 10
 for _ in nlp.pipe(docs, batch_size=2, n_process=2):
 pass

View File

@@ -9,11 +9,12 @@ def test_issue4849():
 nlp = English()
 ruler = EntityRuler(
-nlp, patterns=[
-{"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'},
-{"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'},
+nlp,
+patterns=[
+{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
+{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
 ],
-phrase_matcher_attr="LOWER"
+phrase_matcher_attr="LOWER",
 )
 nlp.add_pipe(ruler)
@@ -27,10 +28,10 @@ def test_issue4849():
 count_ents = 0
 for doc in nlp.pipe([text], n_process=1):
 count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
-assert(count_ents == 2)
+assert count_ents == 2
 # USING 2 PROCESSES
 count_ents = 0
 for doc in nlp.pipe([text], n_process=2):
 count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
-assert (count_ents == 2)
+assert count_ents == 2

View File

@@ -22,7 +22,7 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
 tokenizer_bytes = tokenizer.to_bytes()
 Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
-tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC", "ORTH": "."}]})
+tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
 tokenizer.rules = {}
 tokenizer_bytes = tokenizer.to_bytes()
 tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes)

View File

@@ -28,7 +28,9 @@ def make_tempdir():
 shutil.rmtree(path2str(d))
-def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None):
+def get_doc(
+vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None
+):
 """Create Doc object from given vocab, words and annotations."""
 if deps and not heads:
 heads = [0] * len(deps)
@@ -60,7 +62,7 @@ def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=No
 if attrs.ndim == 1:
 attrs[i] = heads[i]
 else:
-attrs[i,j] = heads[i]
+attrs[i, j] = heads[i]
 else:
 for i in range(len(words)):
 if attrs.ndim == 1: