mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Tidy up and auto-format
This commit is contained in:
parent
b71dd44dbc
commit
828acffc12
|
@ -225,7 +225,9 @@ def train(
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
msg.text("Extending component from base model '{}'".format(pipe))
|
msg.text("Extending component from base model '{}'".format(pipe))
|
||||||
disabled_pipes = nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
|
disabled_pipes = nlp.disable_pipes(
|
||||||
|
[p for p in nlp.pipe_names if p not in pipeline]
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
msg.text("Starting with blank model '{}'".format(lang))
|
msg.text("Starting with blank model '{}'".format(lang))
|
||||||
lang_cls = util.get_lang_class(lang)
|
lang_cls = util.get_lang_class(lang)
|
||||||
|
@ -415,10 +417,10 @@ def train(
|
||||||
losses=losses,
|
losses=losses,
|
||||||
)
|
)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
msg.warn("Error during training")
|
err = "Error during training"
|
||||||
if init_tok2vec:
|
if init_tok2vec:
|
||||||
msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?")
|
err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
|
||||||
msg.fail("Original error message: {}".format(e), exits=1)
|
msg.fail(err, "Original error message: {}".format(e), exits=1)
|
||||||
if raw_text:
|
if raw_text:
|
||||||
# If raw text is available, perform 'rehearsal' updates,
|
# If raw text is available, perform 'rehearsal' updates,
|
||||||
# which use unlabelled data to reduce overfitting.
|
# which use unlabelled data to reduce overfitting.
|
||||||
|
@ -546,7 +548,10 @@ def train(
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
msg.warn("Aborting and saving the final best model. Encountered exception: {}".format(e))
|
msg.warn(
|
||||||
|
"Aborting and saving the final best model. "
|
||||||
|
"Encountered exception: {}".format(e)
|
||||||
|
)
|
||||||
finally:
|
finally:
|
||||||
best_pipes = nlp.pipe_names
|
best_pipes = nlp.pipe_names
|
||||||
if disabled_pipes:
|
if disabled_pipes:
|
||||||
|
@ -563,13 +568,20 @@ def train(
|
||||||
final_meta["speed"].setdefault("gpu", None)
|
final_meta["speed"].setdefault("gpu", None)
|
||||||
# combine cpu and gpu speeds with the base model speeds
|
# combine cpu and gpu speeds with the base model speeds
|
||||||
if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
|
if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
|
||||||
speed = _get_total_speed([final_meta["speed"]["cpu"], meta["speed"]["cpu"]])
|
speed = _get_total_speed(
|
||||||
|
[final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
|
||||||
|
)
|
||||||
final_meta["speed"]["cpu"] = speed
|
final_meta["speed"]["cpu"] = speed
|
||||||
if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
|
if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
|
||||||
speed = _get_total_speed([final_meta["speed"]["gpu"], meta["speed"]["gpu"]])
|
speed = _get_total_speed(
|
||||||
|
[final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
|
||||||
|
)
|
||||||
final_meta["speed"]["gpu"] = speed
|
final_meta["speed"]["gpu"] = speed
|
||||||
# if there were no speeds to update, overwrite with meta
|
# if there were no speeds to update, overwrite with meta
|
||||||
if final_meta["speed"]["cpu"] is None and final_meta["speed"]["gpu"] is None:
|
if (
|
||||||
|
final_meta["speed"]["cpu"] is None
|
||||||
|
and final_meta["speed"]["gpu"] is None
|
||||||
|
):
|
||||||
final_meta["speed"].update(meta["speed"])
|
final_meta["speed"].update(meta["speed"])
|
||||||
# note: beam speeds are not combined with the base model
|
# note: beam speeds are not combined with the base model
|
||||||
if has_beam_widths:
|
if has_beam_widths:
|
||||||
|
|
|
@ -146,9 +146,14 @@ def parse_deps(orig_doc, options={}):
|
||||||
retokenizer.merge(span, attrs=attrs)
|
retokenizer.merge(span, attrs=attrs)
|
||||||
fine_grained = options.get("fine_grained")
|
fine_grained = options.get("fine_grained")
|
||||||
add_lemma = options.get("add_lemma")
|
add_lemma = options.get("add_lemma")
|
||||||
words = [{"text": w.text,
|
words = [
|
||||||
"tag": w.tag_ if fine_grained else w.pos_,
|
{
|
||||||
"lemma": w.lemma_ if add_lemma else None} for w in doc]
|
"text": w.text,
|
||||||
|
"tag": w.tag_ if fine_grained else w.pos_,
|
||||||
|
"lemma": w.lemma_ if add_lemma else None,
|
||||||
|
}
|
||||||
|
for w in doc
|
||||||
|
]
|
||||||
|
|
||||||
arcs = []
|
arcs = []
|
||||||
for word in doc:
|
for word in doc:
|
||||||
|
|
|
@ -3,7 +3,13 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS, TPL_ENTS
|
from .templates import (
|
||||||
|
TPL_DEP_SVG,
|
||||||
|
TPL_DEP_WORDS,
|
||||||
|
TPL_DEP_WORDS_LEMMA,
|
||||||
|
TPL_DEP_ARCS,
|
||||||
|
TPL_ENTS,
|
||||||
|
)
|
||||||
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
||||||
from ..util import minify_html, escape_html, registry
|
from ..util import minify_html, escape_html, registry
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
@ -83,7 +89,10 @@ class DependencyRenderer(object):
|
||||||
self.width = self.offset_x + len(words) * self.distance
|
self.width = self.offset_x + len(words) * self.distance
|
||||||
self.height = self.offset_y + 3 * self.word_spacing
|
self.height = self.offset_y + 3 * self.word_spacing
|
||||||
self.id = render_id
|
self.id = render_id
|
||||||
words = [self.render_word(w["text"], w["tag"], w.get("lemma", None), i) for i, w in enumerate(words)]
|
words = [
|
||||||
|
self.render_word(w["text"], w["tag"], w.get("lemma", None), i)
|
||||||
|
for i, w in enumerate(words)
|
||||||
|
]
|
||||||
arcs = [
|
arcs = [
|
||||||
self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
|
self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
|
||||||
for i, a in enumerate(arcs)
|
for i, a in enumerate(arcs)
|
||||||
|
@ -101,7 +110,9 @@ class DependencyRenderer(object):
|
||||||
lang=self.lang,
|
lang=self.lang,
|
||||||
)
|
)
|
||||||
|
|
||||||
def render_word(self, text, tag, lemma, i,):
|
def render_word(
|
||||||
|
self, text, tag, lemma, i,
|
||||||
|
):
|
||||||
"""Render individual word.
|
"""Render individual word.
|
||||||
|
|
||||||
text (unicode): Word text.
|
text (unicode): Word text.
|
||||||
|
@ -115,7 +126,9 @@ class DependencyRenderer(object):
|
||||||
x = self.width - x
|
x = self.width - x
|
||||||
html_text = escape_html(text)
|
html_text = escape_html(text)
|
||||||
if lemma is not None:
|
if lemma is not None:
|
||||||
return TPL_DEP_WORDS_LEMMA.format(text=html_text, tag=tag, lemma=lemma, x=x, y=y)
|
return TPL_DEP_WORDS_LEMMA.format(
|
||||||
|
text=html_text, tag=tag, lemma=lemma, x=x, y=y
|
||||||
|
)
|
||||||
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
||||||
|
|
||||||
def render_arrow(self, label, start, end, direction, i):
|
def render_arrow(self, label, start, end, direction, i):
|
||||||
|
|
|
@ -112,7 +112,6 @@ class Warnings(object):
|
||||||
"in problems with the vocab further on in the pipeline.")
|
"in problems with the vocab further on in the pipeline.")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
class Errors(object):
|
class Errors(object):
|
||||||
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
|
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
|
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
|
||||||
from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT
|
from ..char_classes import CURRENCY, UNITS, PUNCT
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||||
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
||||||
|
|
||||||
|
|
|
@ -10,5 +10,5 @@ Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
sentences = [
|
sentences = [
|
||||||
"bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du",
|
"bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du",
|
||||||
"gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira"
|
"gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira",
|
||||||
]
|
]
|
||||||
|
|
|
@ -59,7 +59,6 @@ behin
|
||||||
""".split()
|
""".split()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
if text.startswith(("+", "-", "±", "~")):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
|
|
|
@ -5,7 +5,7 @@ from __future__ import unicode_literals
|
||||||
# https://www.ranks.nl/stopwords/basque
|
# https://www.ranks.nl/stopwords/basque
|
||||||
# https://www.mustgo.com/worldlanguages/basque/
|
# https://www.mustgo.com/worldlanguages/basque/
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
al
|
al
|
||||||
anitz
|
anitz
|
||||||
arabera
|
arabera
|
||||||
|
|
|
@ -16,7 +16,9 @@ _hyphen_suffixes += " " + _hyphen_suffixes.upper()
|
||||||
|
|
||||||
|
|
||||||
_prefixes = TOKENIZER_PREFIXES + [
|
_prefixes = TOKENIZER_PREFIXES + [
|
||||||
r"(?:({pe})[{el}])(?=[{a}])".format(a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision))
|
r"(?:({pe})[{el}])(?=[{a}])".format(
|
||||||
|
a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
|
||||||
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
_suffixes = (
|
_suffixes = (
|
||||||
|
@ -33,7 +35,9 @@ _suffixes = (
|
||||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
|
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
|
||||||
),
|
),
|
||||||
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||||
r"(?<=[{a}])[{h}]({hs})".format(a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)),
|
r"(?<=[{a}])[{h}]({hs})".format(
|
||||||
|
a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)
|
||||||
|
),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -6,10 +6,10 @@ import re
|
||||||
from .punctuation import ELISION, HYPHENS
|
from .punctuation import ELISION, HYPHENS
|
||||||
from ..tokenizer_exceptions import URL_PATTERN
|
from ..tokenizer_exceptions import URL_PATTERN
|
||||||
from ..char_classes import ALPHA_LOWER, ALPHA
|
from ..char_classes import ALPHA_LOWER, ALPHA
|
||||||
from ...symbols import ORTH, LEMMA, TAG
|
from ...symbols import ORTH, LEMMA
|
||||||
|
|
||||||
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
|
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
|
||||||
#from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
|
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
|
||||||
FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]
|
FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]
|
||||||
|
|
||||||
|
|
||||||
|
@ -93,7 +93,7 @@ for verb, verb_lemma in [
|
||||||
for pronoun in ["elle", "il", "on"]:
|
for pronoun in ["elle", "il", "on"]:
|
||||||
token = "{}-t-{}".format(orth, pronoun)
|
token = "{}-t-{}".format(orth, pronoun)
|
||||||
_exc[token] = [
|
_exc[token] = [
|
||||||
{LEMMA: verb_lemma, ORTH: orth}, #, TAG: "VERB"},
|
{LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"},
|
||||||
{LEMMA: "t", ORTH: "-t"},
|
{LEMMA: "t", ORTH: "-t"},
|
||||||
{LEMMA: pronoun, ORTH: "-" + pronoun},
|
{LEMMA: pronoun, ORTH: "-" + pronoun},
|
||||||
]
|
]
|
||||||
|
@ -102,7 +102,7 @@ for verb, verb_lemma in [("est", "être")]:
|
||||||
for orth in [verb, verb.title()]:
|
for orth in [verb, verb.title()]:
|
||||||
token = "{}-ce".format(orth)
|
token = "{}-ce".format(orth)
|
||||||
_exc[token] = [
|
_exc[token] = [
|
||||||
{LEMMA: verb_lemma, ORTH: orth}, #, TAG: "VERB"},
|
{LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"},
|
||||||
{LEMMA: "ce", ORTH: "-ce"},
|
{LEMMA: "ce", ORTH: "-ce"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
||||||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
||||||
from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES
|
from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES
|
||||||
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER
|
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER
|
||||||
|
@ -10,14 +10,7 @@ from ..char_classes import ALPHA_LOWER, ALPHA_UPPER
|
||||||
ELISION = "'’"
|
ELISION = "'’"
|
||||||
|
|
||||||
|
|
||||||
_prefixes = (
|
_prefixes = [r"'[0-9][0-9]", r"[0-9]+°"] + BASE_TOKENIZER_PREFIXES
|
||||||
[
|
|
||||||
r"'[0-9][0-9]",
|
|
||||||
r"[0-9]+°",
|
|
||||||
|
|
||||||
]
|
|
||||||
+ TOKENIZER_PREFIXES
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
_infixes = (
|
_infixes = (
|
||||||
|
@ -31,7 +24,7 @@ _infixes = (
|
||||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER),
|
r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER),
|
||||||
r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA),
|
||||||
r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION)
|
r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,7 @@ _exc = {
|
||||||
"l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
|
"l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
|
||||||
"nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
|
"nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
|
||||||
"po'": [{ORTH: "po'", LEMMA: "poco"}],
|
"po'": [{ORTH: "po'", LEMMA: "poco"}],
|
||||||
"sett..": [{ORTH: "sett."}, {ORTH: "."}]
|
"sett..": [{ORTH: "sett."}, {ORTH: "."}],
|
||||||
}
|
}
|
||||||
|
|
||||||
for orth in [
|
for orth in [
|
||||||
|
@ -32,7 +32,7 @@ for orth in [
|
||||||
"col.",
|
"col.",
|
||||||
"Cost.",
|
"Cost.",
|
||||||
"d.C.",
|
"d.C.",
|
||||||
'de"'
|
'de"',
|
||||||
"distr.",
|
"distr.",
|
||||||
"E'",
|
"E'",
|
||||||
"ecc.",
|
"ecc.",
|
||||||
|
|
|
@ -8,7 +8,7 @@ a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri a
|
||||||
|
|
||||||
bella belle belli bello ben
|
bella belle belli bello ben
|
||||||
|
|
||||||
ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse
|
ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse
|
||||||
|
|
||||||
d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo
|
d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo
|
||||||
|
|
||||||
|
|
|
@ -44,7 +44,7 @@ for prep, prep_lemma in [
|
||||||
("s’", "se"),
|
("s’", "se"),
|
||||||
]:
|
]:
|
||||||
for prefix_orth in [prefix, prefix.capitalize()]:
|
for prefix_orth in [prefix, prefix.capitalize()]:
|
||||||
_exc[prefix_orth+prep] = [
|
_exc[prefix_orth + prep] = [
|
||||||
{ORTH: prefix_orth, LEMMA: prefix_lemma},
|
{ORTH: prefix_orth, LEMMA: prefix_lemma},
|
||||||
{ORTH: prep, LEMMA: prep_lemma},
|
{ORTH: prep, LEMMA: prep_lemma},
|
||||||
]
|
]
|
||||||
|
|
|
@ -29,7 +29,9 @@ class LithuanianDefaults(Language.Defaults):
|
||||||
|
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
mod_base_exceptions = {exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")}
|
mod_base_exceptions = {
|
||||||
|
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
|
||||||
|
}
|
||||||
del mod_base_exceptions["8)"]
|
del mod_base_exceptions["8)"]
|
||||||
tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
|
@ -8,262 +8,262 @@ _exc = {}
|
||||||
for orth in [
|
for orth in [
|
||||||
"n-tosios",
|
"n-tosios",
|
||||||
"?!",
|
"?!",
|
||||||
# "G.",
|
# "G.",
|
||||||
# "J. E.",
|
# "J. E.",
|
||||||
# "J. Em.",
|
# "J. Em.",
|
||||||
# "J.E.",
|
# "J.E.",
|
||||||
# "J.Em.",
|
# "J.Em.",
|
||||||
# "K.",
|
# "K.",
|
||||||
# "N.",
|
# "N.",
|
||||||
# "V.",
|
# "V.",
|
||||||
# "Vt.",
|
# "Vt.",
|
||||||
# "a.",
|
# "a.",
|
||||||
# "a.k.",
|
# "a.k.",
|
||||||
# "a.s.",
|
# "a.s.",
|
||||||
# "adv.",
|
# "adv.",
|
||||||
# "akad.",
|
# "akad.",
|
||||||
# "aklg.",
|
# "aklg.",
|
||||||
# "akt.",
|
# "akt.",
|
||||||
# "al.",
|
# "al.",
|
||||||
# "ang.",
|
# "ang.",
|
||||||
# "angl.",
|
# "angl.",
|
||||||
# "aps.",
|
# "aps.",
|
||||||
# "apskr.",
|
# "apskr.",
|
||||||
# "apyg.",
|
# "apyg.",
|
||||||
# "arbat.",
|
# "arbat.",
|
||||||
# "asist.",
|
# "asist.",
|
||||||
# "asm.",
|
# "asm.",
|
||||||
# "asm.k.",
|
# "asm.k.",
|
||||||
# "asmv.",
|
# "asmv.",
|
||||||
# "atk.",
|
# "atk.",
|
||||||
# "atsak.",
|
# "atsak.",
|
||||||
# "atsisk.",
|
# "atsisk.",
|
||||||
# "atsisk.sąsk.",
|
# "atsisk.sąsk.",
|
||||||
# "atv.",
|
# "atv.",
|
||||||
# "aut.",
|
# "aut.",
|
||||||
# "avd.",
|
# "avd.",
|
||||||
# "b.k.",
|
# "b.k.",
|
||||||
# "baud.",
|
# "baud.",
|
||||||
# "biol.",
|
# "biol.",
|
||||||
# "bkl.",
|
# "bkl.",
|
||||||
# "bot.",
|
# "bot.",
|
||||||
# "bt.",
|
# "bt.",
|
||||||
# "buv.",
|
# "buv.",
|
||||||
# "ch.",
|
# "ch.",
|
||||||
# "chem.",
|
# "chem.",
|
||||||
# "corp.",
|
# "corp.",
|
||||||
# "d.",
|
# "d.",
|
||||||
# "dab.",
|
# "dab.",
|
||||||
# "dail.",
|
# "dail.",
|
||||||
# "dek.",
|
# "dek.",
|
||||||
# "deš.",
|
# "deš.",
|
||||||
# "dir.",
|
# "dir.",
|
||||||
# "dirig.",
|
# "dirig.",
|
||||||
# "doc.",
|
# "doc.",
|
||||||
# "dol.",
|
# "dol.",
|
||||||
# "dr.",
|
# "dr.",
|
||||||
# "drp.",
|
# "drp.",
|
||||||
# "dvit.",
|
# "dvit.",
|
||||||
# "dėst.",
|
# "dėst.",
|
||||||
# "dš.",
|
# "dš.",
|
||||||
# "dž.",
|
# "dž.",
|
||||||
# "e.b.",
|
# "e.b.",
|
||||||
# "e.bankas",
|
# "e.bankas",
|
||||||
# "e.p.",
|
# "e.p.",
|
||||||
# "e.parašas",
|
# "e.parašas",
|
||||||
# "e.paštas",
|
# "e.paštas",
|
||||||
# "e.v.",
|
# "e.v.",
|
||||||
# "e.valdžia",
|
# "e.valdžia",
|
||||||
# "egz.",
|
# "egz.",
|
||||||
# "eil.",
|
# "eil.",
|
||||||
# "ekon.",
|
# "ekon.",
|
||||||
# "el.",
|
# "el.",
|
||||||
# "el.bankas",
|
# "el.bankas",
|
||||||
# "el.p.",
|
# "el.p.",
|
||||||
# "el.parašas",
|
# "el.parašas",
|
||||||
# "el.paštas",
|
# "el.paštas",
|
||||||
# "el.valdžia",
|
# "el.valdžia",
|
||||||
# "etc.",
|
# "etc.",
|
||||||
# "ež.",
|
# "ež.",
|
||||||
# "fak.",
|
# "fak.",
|
||||||
# "faks.",
|
# "faks.",
|
||||||
# "feat.",
|
# "feat.",
|
||||||
# "filol.",
|
# "filol.",
|
||||||
# "filos.",
|
# "filos.",
|
||||||
# "g.",
|
# "g.",
|
||||||
# "gen.",
|
# "gen.",
|
||||||
# "geol.",
|
# "geol.",
|
||||||
# "gerb.",
|
# "gerb.",
|
||||||
# "gim.",
|
# "gim.",
|
||||||
# "gr.",
|
# "gr.",
|
||||||
# "gv.",
|
# "gv.",
|
||||||
# "gyd.",
|
# "gyd.",
|
||||||
# "gyv.",
|
# "gyv.",
|
||||||
# "habil.",
|
# "habil.",
|
||||||
# "inc.",
|
# "inc.",
|
||||||
# "insp.",
|
# "insp.",
|
||||||
# "inž.",
|
# "inž.",
|
||||||
# "ir pan.",
|
# "ir pan.",
|
||||||
# "ir t. t.",
|
# "ir t. t.",
|
||||||
# "isp.",
|
# "isp.",
|
||||||
# "istor.",
|
# "istor.",
|
||||||
# "it.",
|
# "it.",
|
||||||
# "just.",
|
# "just.",
|
||||||
# "k.",
|
# "k.",
|
||||||
# "k. a.",
|
# "k. a.",
|
||||||
# "k.a.",
|
# "k.a.",
|
||||||
# "kab.",
|
# "kab.",
|
||||||
# "kand.",
|
# "kand.",
|
||||||
# "kart.",
|
# "kart.",
|
||||||
# "kat.",
|
# "kat.",
|
||||||
# "ketv.",
|
# "ketv.",
|
||||||
# "kh.",
|
# "kh.",
|
||||||
# "kl.",
|
# "kl.",
|
||||||
# "kln.",
|
# "kln.",
|
||||||
# "km.",
|
# "km.",
|
||||||
# "kn.",
|
# "kn.",
|
||||||
# "koresp.",
|
# "koresp.",
|
||||||
# "kpt.",
|
# "kpt.",
|
||||||
# "kr.",
|
# "kr.",
|
||||||
# "kt.",
|
# "kt.",
|
||||||
# "kub.",
|
# "kub.",
|
||||||
# "kun.",
|
# "kun.",
|
||||||
# "kv.",
|
# "kv.",
|
||||||
# "kyš.",
|
# "kyš.",
|
||||||
# "l. e. p.",
|
# "l. e. p.",
|
||||||
# "l.e.p.",
|
# "l.e.p.",
|
||||||
# "lenk.",
|
# "lenk.",
|
||||||
# "liet.",
|
# "liet.",
|
||||||
# "lot.",
|
# "lot.",
|
||||||
# "lt.",
|
# "lt.",
|
||||||
# "ltd.",
|
# "ltd.",
|
||||||
# "ltn.",
|
# "ltn.",
|
||||||
# "m.",
|
# "m.",
|
||||||
# "m.e..",
|
# "m.e..",
|
||||||
# "m.m.",
|
# "m.m.",
|
||||||
# "mat.",
|
# "mat.",
|
||||||
# "med.",
|
# "med.",
|
||||||
# "mgnt.",
|
# "mgnt.",
|
||||||
# "mgr.",
|
# "mgr.",
|
||||||
# "min.",
|
# "min.",
|
||||||
# "mjr.",
|
# "mjr.",
|
||||||
# "ml.",
|
# "ml.",
|
||||||
# "mln.",
|
# "mln.",
|
||||||
# "mlrd.",
|
# "mlrd.",
|
||||||
# "mob.",
|
# "mob.",
|
||||||
# "mok.",
|
# "mok.",
|
||||||
# "moksl.",
|
# "moksl.",
|
||||||
# "mokyt.",
|
# "mokyt.",
|
||||||
# "mot.",
|
# "mot.",
|
||||||
# "mr.",
|
# "mr.",
|
||||||
# "mst.",
|
# "mst.",
|
||||||
# "mstl.",
|
# "mstl.",
|
||||||
# "mėn.",
|
# "mėn.",
|
||||||
# "nkt.",
|
# "nkt.",
|
||||||
# "no.",
|
# "no.",
|
||||||
# "nr.",
|
# "nr.",
|
||||||
# "ntk.",
|
# "ntk.",
|
||||||
# "nuotr.",
|
# "nuotr.",
|
||||||
# "op.",
|
# "op.",
|
||||||
# "org.",
|
# "org.",
|
||||||
# "orig.",
|
# "orig.",
|
||||||
# "p.",
|
# "p.",
|
||||||
# "p.d.",
|
# "p.d.",
|
||||||
# "p.m.e.",
|
# "p.m.e.",
|
||||||
# "p.s.",
|
# "p.s.",
|
||||||
# "pab.",
|
# "pab.",
|
||||||
# "pan.",
|
# "pan.",
|
||||||
# "past.",
|
# "past.",
|
||||||
# "pav.",
|
# "pav.",
|
||||||
# "pavad.",
|
# "pavad.",
|
||||||
# "per.",
|
# "per.",
|
||||||
# "perd.",
|
# "perd.",
|
||||||
# "pirm.",
|
# "pirm.",
|
||||||
# "pl.",
|
# "pl.",
|
||||||
# "plg.",
|
# "plg.",
|
||||||
# "plk.",
|
# "plk.",
|
||||||
# "pr.",
|
# "pr.",
|
||||||
# "pr.Kr.",
|
# "pr.Kr.",
|
||||||
# "pranc.",
|
# "pranc.",
|
||||||
# "proc.",
|
# "proc.",
|
||||||
# "prof.",
|
# "prof.",
|
||||||
# "prom.",
|
# "prom.",
|
||||||
# "prot.",
|
# "prot.",
|
||||||
# "psl.",
|
# "psl.",
|
||||||
# "pss.",
|
# "pss.",
|
||||||
# "pvz.",
|
# "pvz.",
|
||||||
# "pšt.",
|
# "pšt.",
|
||||||
# "r.",
|
# "r.",
|
||||||
# "raj.",
|
# "raj.",
|
||||||
# "red.",
|
# "red.",
|
||||||
# "rez.",
|
# "rez.",
|
||||||
# "rež.",
|
# "rež.",
|
||||||
# "rus.",
|
# "rus.",
|
||||||
# "rš.",
|
# "rš.",
|
||||||
# "s.",
|
# "s.",
|
||||||
# "sav.",
|
# "sav.",
|
||||||
# "saviv.",
|
# "saviv.",
|
||||||
# "sek.",
|
# "sek.",
|
||||||
# "sekr.",
|
# "sekr.",
|
||||||
# "sen.",
|
# "sen.",
|
||||||
# "sh.",
|
# "sh.",
|
||||||
# "sk.",
|
# "sk.",
|
||||||
# "skg.",
|
# "skg.",
|
||||||
# "skv.",
|
# "skv.",
|
||||||
# "skyr.",
|
# "skyr.",
|
||||||
# "sp.",
|
# "sp.",
|
||||||
# "spec.",
|
# "spec.",
|
||||||
# "sr.",
|
# "sr.",
|
||||||
# "st.",
|
# "st.",
|
||||||
# "str.",
|
# "str.",
|
||||||
# "stud.",
|
# "stud.",
|
||||||
# "sąs.",
|
# "sąs.",
|
||||||
# "t.",
|
# "t.",
|
||||||
# "t. p.",
|
# "t. p.",
|
||||||
# "t. y.",
|
# "t. y.",
|
||||||
# "t.p.",
|
# "t.p.",
|
||||||
# "t.t.",
|
# "t.t.",
|
||||||
# "t.y.",
|
# "t.y.",
|
||||||
# "techn.",
|
# "techn.",
|
||||||
# "tel.",
|
# "tel.",
|
||||||
# "teol.",
|
# "teol.",
|
||||||
# "th.",
|
# "th.",
|
||||||
# "tir.",
|
# "tir.",
|
||||||
# "trit.",
|
# "trit.",
|
||||||
# "trln.",
|
# "trln.",
|
||||||
# "tšk.",
|
# "tšk.",
|
||||||
# "tūks.",
|
# "tūks.",
|
||||||
# "tūkst.",
|
# "tūkst.",
|
||||||
# "up.",
|
# "up.",
|
||||||
# "upl.",
|
# "upl.",
|
||||||
# "v.s.",
|
# "v.s.",
|
||||||
# "vad.",
|
# "vad.",
|
||||||
# "val.",
|
# "val.",
|
||||||
# "valg.",
|
# "valg.",
|
||||||
# "ved.",
|
# "ved.",
|
||||||
# "vert.",
|
# "vert.",
|
||||||
# "vet.",
|
# "vet.",
|
||||||
# "vid.",
|
# "vid.",
|
||||||
# "virš.",
|
# "virš.",
|
||||||
# "vlsč.",
|
# "vlsč.",
|
||||||
# "vnt.",
|
# "vnt.",
|
||||||
# "vok.",
|
# "vok.",
|
||||||
# "vs.",
|
# "vs.",
|
||||||
# "vtv.",
|
# "vtv.",
|
||||||
# "vv.",
|
# "vv.",
|
||||||
# "vyr.",
|
# "vyr.",
|
||||||
# "vyresn.",
|
# "vyresn.",
|
||||||
# "zool.",
|
# "zool.",
|
||||||
# "Įn",
|
# "Įn",
|
||||||
# "įl.",
|
# "įl.",
|
||||||
# "š.m.",
|
# "š.m.",
|
||||||
# "šnek.",
|
# "šnek.",
|
||||||
# "šv.",
|
# "šv.",
|
||||||
# "švč.",
|
# "švč.",
|
||||||
# "ž.ū.",
|
# "ž.ū.",
|
||||||
# "žin.",
|
# "žin.",
|
||||||
# "žml.",
|
# "žml.",
|
||||||
# "žr.",
|
# "žr.",
|
||||||
]:
|
]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
|
@ -24,7 +24,6 @@ _prefixes = (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
_infixes = (
|
_infixes = (
|
||||||
LIST_ELLIPSES
|
LIST_ELLIPSES
|
||||||
+ _list_icons
|
+ _list_icons
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import ORTH, NORM
|
from ...symbols import ORTH
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -37,7 +37,7 @@ URL_PATTERN = (
|
||||||
r"|"
|
r"|"
|
||||||
# host & domain names
|
# host & domain names
|
||||||
# mods: match is case-sensitive, so include [A-Z]
|
# mods: match is case-sensitive, so include [A-Z]
|
||||||
"(?:"
|
"(?:" # noqa
|
||||||
"(?:"
|
"(?:"
|
||||||
"[A-Za-z0-9\u00a1-\uffff]"
|
"[A-Za-z0-9\u00a1-\uffff]"
|
||||||
"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
|
"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
|
||||||
|
|
|
@ -612,7 +612,7 @@ class Language(object):
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
if self.vocab.vectors.data.shape[1]:
|
if self.vocab.vectors.data.shape[1]:
|
||||||
cfg["pretrained_vectors"] = self.vocab.vectors.name
|
cfg["pretrained_vectors"] = self.vocab.vectors.name
|
||||||
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
cfg["pretrained_dims"] = self.vocab.vectors.data.shape[1]
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
sgd = create_default_optimizer(Model.ops)
|
sgd = create_default_optimizer(Model.ops)
|
||||||
self._optimizer = sgd
|
self._optimizer = sgd
|
||||||
|
@ -857,7 +857,14 @@ class Language(object):
|
||||||
procs = [
|
procs = [
|
||||||
mp.Process(
|
mp.Process(
|
||||||
target=_apply_pipes,
|
target=_apply_pipes,
|
||||||
args=(self.make_doc, pipes, rch, sch, Underscore.get_state(), load_nlp.VECTORS),
|
args=(
|
||||||
|
self.make_doc,
|
||||||
|
pipes,
|
||||||
|
rch,
|
||||||
|
sch,
|
||||||
|
Underscore.get_state(),
|
||||||
|
load_nlp.VECTORS,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
for rch, sch in zip(texts_q, bytedocs_send_ch)
|
for rch, sch in zip(texts_q, bytedocs_send_ch)
|
||||||
]
|
]
|
||||||
|
|
|
@ -222,11 +222,9 @@ class EntityRuler(object):
|
||||||
for label, pattern, ent_id in zip(
|
for label, pattern, ent_id in zip(
|
||||||
phrase_pattern_labels,
|
phrase_pattern_labels,
|
||||||
self.nlp.pipe(phrase_pattern_texts),
|
self.nlp.pipe(phrase_pattern_texts),
|
||||||
phrase_pattern_ids
|
phrase_pattern_ids,
|
||||||
):
|
):
|
||||||
phrase_pattern = {
|
phrase_pattern = {"label": label, "pattern": pattern, "id": ent_id}
|
||||||
"label": label, "pattern": pattern, "id": ent_id
|
|
||||||
}
|
|
||||||
if ent_id:
|
if ent_id:
|
||||||
phrase_pattern["id"] = ent_id
|
phrase_pattern["id"] = ent_id
|
||||||
phrase_patterns.append(phrase_pattern)
|
phrase_patterns.append(phrase_pattern)
|
||||||
|
|
|
@ -71,9 +71,7 @@ def test_doc_array_to_from_string_attrs(en_vocab, attrs):
|
||||||
def test_doc_array_idx(en_vocab):
|
def test_doc_array_idx(en_vocab):
|
||||||
"""Test that Doc.to_array can retrieve token start indices"""
|
"""Test that Doc.to_array can retrieve token start indices"""
|
||||||
words = ["An", "example", "sentence"]
|
words = ["An", "example", "sentence"]
|
||||||
doc = Doc(en_vocab, words=words)
|
|
||||||
offsets = Doc(en_vocab, words=words).to_array("IDX")
|
offsets = Doc(en_vocab, words=words).to_array("IDX")
|
||||||
|
|
||||||
assert offsets[0] == 0
|
assert offsets[0] == 0
|
||||||
assert offsets[1] == 3
|
assert offsets[1] == 3
|
||||||
assert offsets[2] == 11
|
assert offsets[2] == 11
|
||||||
|
|
|
@ -59,7 +59,7 @@ def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
|
||||||
("Sprogteknologi a/s", 2),
|
("Sprogteknologi a/s", 2),
|
||||||
("De boede i A/B Bellevue", 5),
|
("De boede i A/B Bellevue", 5),
|
||||||
# note: skipping due to weirdness in UD_Danish-DDT
|
# note: skipping due to weirdness in UD_Danish-DDT
|
||||||
#("Rotorhastigheden er 3400 o/m.", 5),
|
# ("Rotorhastigheden er 3400 o/m.", 5),
|
||||||
("Jeg købte billet t/r.", 5),
|
("Jeg købte billet t/r.", 5),
|
||||||
("Murerarbejdsmand m/k søges", 3),
|
("Murerarbejdsmand m/k søges", 3),
|
||||||
("Netværket kører over TCP/IP", 4),
|
("Netværket kører over TCP/IP", 4),
|
||||||
|
|
|
@ -10,7 +10,13 @@ def test_eu_tokenizer_handles_long_text(eu_tokenizer):
|
||||||
assert len(tokens) == 5
|
assert len(tokens) == 5
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text,length", [("milesker ederra joan zen hitzaldia plazer hutsa", 7), ("astelehen guztia sofan pasau biot", 5)])
|
@pytest.mark.parametrize(
|
||||||
|
"text,length",
|
||||||
|
[
|
||||||
|
("milesker ederra joan zen hitzaldia plazer hutsa", 7),
|
||||||
|
("astelehen guztia sofan pasau biot", 5),
|
||||||
|
],
|
||||||
|
)
|
||||||
def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length):
|
def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length):
|
||||||
tokens = eu_tokenizer(text)
|
tokens = eu_tokenizer(text)
|
||||||
assert len(tokens) == length
|
assert len(tokens) == length
|
||||||
|
|
|
@ -297,12 +297,7 @@ WIKI_TESTS = [
|
||||||
]
|
]
|
||||||
|
|
||||||
EXTRA_TESTS = (
|
EXTRA_TESTS = (
|
||||||
DOT_TESTS
|
DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS
|
||||||
+ QUOTE_TESTS
|
|
||||||
+ NUMBER_TESTS
|
|
||||||
+ HYPHEN_TESTS
|
|
||||||
+ WIKI_TESTS
|
|
||||||
+ TYPO_TESTS
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# normal: default tests + 10% of extra tests
|
# normal: default tests + 10% of extra tests
|
||||||
|
@ -311,7 +306,14 @@ TESTS.extend([x for i, x in enumerate(EXTRA_TESTS) if i % 10 == 0])
|
||||||
|
|
||||||
# slow: remaining 90% of extra tests
|
# slow: remaining 90% of extra tests
|
||||||
SLOW_TESTS = [x for i, x in enumerate(EXTRA_TESTS) if i % 10 != 0]
|
SLOW_TESTS = [x for i, x in enumerate(EXTRA_TESTS) if i % 10 != 0]
|
||||||
TESTS.extend([pytest.param(x[0], x[1], marks=pytest.mark.slow()) if not isinstance(x[0], tuple) else x for x in SLOW_TESTS])
|
TESTS.extend(
|
||||||
|
[
|
||||||
|
pytest.param(x[0], x[1], marks=pytest.mark.slow())
|
||||||
|
if not isinstance(x[0], tuple)
|
||||||
|
else x
|
||||||
|
for x in SLOW_TESTS
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text,expected_tokens", TESTS)
|
@pytest.mark.parametrize("text,expected_tokens", TESTS)
|
||||||
|
|
|
@ -6,7 +6,8 @@ import re
|
||||||
from mock import Mock
|
from mock import Mock
|
||||||
from spacy.matcher import Matcher, DependencyMatcher
|
from spacy.matcher import Matcher, DependencyMatcher
|
||||||
from spacy.tokens import Doc, Token
|
from spacy.tokens import Doc, Token
|
||||||
from ..doc.test_underscore import clean_underscore
|
|
||||||
|
from ..doc.test_underscore import clean_underscore # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
|
|
@ -152,10 +152,5 @@ def test_entity_ruler_validate(nlp):
|
||||||
|
|
||||||
def test_entity_ruler_properties(nlp, patterns):
|
def test_entity_ruler_properties(nlp, patterns):
|
||||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||||
assert sorted(ruler.labels) == sorted([
|
assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"])
|
||||||
"HELLO",
|
|
||||||
"BYE",
|
|
||||||
"COMPLEX",
|
|
||||||
"TECH_ORG"
|
|
||||||
])
|
|
||||||
assert sorted(ruler.ent_ids) == ["a1", "a2"]
|
assert sorted(ruler.ent_ids) == ["a1", "a2"]
|
||||||
|
|
|
@ -23,4 +23,3 @@ def test_issue4725():
|
||||||
docs = ["Kurt is in London."] * 10
|
docs = ["Kurt is in London."] * 10
|
||||||
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
|
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
|
@ -9,11 +9,12 @@ def test_issue4849():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
|
|
||||||
ruler = EntityRuler(
|
ruler = EntityRuler(
|
||||||
nlp, patterns=[
|
nlp,
|
||||||
{"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'},
|
patterns=[
|
||||||
{"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'},
|
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
|
||||||
|
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
|
||||||
],
|
],
|
||||||
phrase_matcher_attr="LOWER"
|
phrase_matcher_attr="LOWER",
|
||||||
)
|
)
|
||||||
|
|
||||||
nlp.add_pipe(ruler)
|
nlp.add_pipe(ruler)
|
||||||
|
@ -27,10 +28,10 @@ def test_issue4849():
|
||||||
count_ents = 0
|
count_ents = 0
|
||||||
for doc in nlp.pipe([text], n_process=1):
|
for doc in nlp.pipe([text], n_process=1):
|
||||||
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
||||||
assert(count_ents == 2)
|
assert count_ents == 2
|
||||||
|
|
||||||
# USING 2 PROCESSES
|
# USING 2 PROCESSES
|
||||||
count_ents = 0
|
count_ents = 0
|
||||||
for doc in nlp.pipe([text], n_process=2):
|
for doc in nlp.pipe([text], n_process=2):
|
||||||
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
||||||
assert (count_ents == 2)
|
assert count_ents == 2
|
||||||
|
|
|
@ -22,7 +22,7 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
|
||||||
tokenizer_bytes = tokenizer.to_bytes()
|
tokenizer_bytes = tokenizer.to_bytes()
|
||||||
Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
|
Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
|
||||||
|
|
||||||
tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC", "ORTH": "."}]})
|
tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
|
||||||
tokenizer.rules = {}
|
tokenizer.rules = {}
|
||||||
tokenizer_bytes = tokenizer.to_bytes()
|
tokenizer_bytes = tokenizer.to_bytes()
|
||||||
tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
|
tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
|
||||||
|
|
|
@ -28,7 +28,9 @@ def make_tempdir():
|
||||||
shutil.rmtree(path2str(d))
|
shutil.rmtree(path2str(d))
|
||||||
|
|
||||||
|
|
||||||
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None):
|
def get_doc(
|
||||||
|
vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None
|
||||||
|
):
|
||||||
"""Create Doc object from given vocab, words and annotations."""
|
"""Create Doc object from given vocab, words and annotations."""
|
||||||
if deps and not heads:
|
if deps and not heads:
|
||||||
heads = [0] * len(deps)
|
heads = [0] * len(deps)
|
||||||
|
@ -60,7 +62,7 @@ def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=No
|
||||||
if attrs.ndim == 1:
|
if attrs.ndim == 1:
|
||||||
attrs[i] = heads[i]
|
attrs[i] = heads[i]
|
||||||
else:
|
else:
|
||||||
attrs[i,j] = heads[i]
|
attrs[i, j] = heads[i]
|
||||||
else:
|
else:
|
||||||
for i in range(len(words)):
|
for i in range(len(words)):
|
||||||
if attrs.ndim == 1:
|
if attrs.ndim == 1:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user