set encodings explicitly to utf8 (#4551)
This commit is contained in:
parent 9e210fa7fd
commit 33ba9ff464

@@ -84,7 +84,7 @@ def read_conllu(file_):
 def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
     if text_loc.parts[-1].endswith(".conllu"):
         docs = []
-        with text_loc.open() as file_:
+        with text_loc.open(encoding="utf8") as file_:
             for conllu_doc in read_conllu(file_):
                 for conllu_sent in conllu_doc:
                     words = [line[1] for line in conllu_sent]
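
Why this matters: in Python 3, open() and pathlib.Path.open() without an explicit encoding fall back to locale.getpreferredencoding(False), which is UTF-8 on most Linux and macOS systems but often cp1252 on Windows. Universal Dependencies .conllu files are UTF-8, so reading them with the locale default can raise UnicodeDecodeError or silently produce mojibake. A minimal, self-contained sketch of the difference (the file name and contents are invented for illustration):

import locale
from pathlib import Path

# Hypothetical sample; real UD treebanks are distributed as UTF-8.
sample = Path("sample.conllu")
sample.write_bytes("1\tKöln\tKöln\tPROPN\n".encode("utf8"))

# The implicit default depends on the platform locale.
print("locale default:", locale.getpreferredencoding(False))

# An explicit encoding reads the same bytes identically everywhere.
with sample.open(encoding="utf8") as file_:
    print(file_.read())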

@@ -203,7 +203,7 @@ def golds_to_gold_tuples(docs, golds):
 def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
     if text_loc.parts[-1].endswith(".conllu"):
         docs = []
-        with text_loc.open() as file_:
+        with text_loc.open(encoding="utf8") as file_:
             for conllu_doc in read_conllu(file_):
                 for conllu_sent in conllu_doc:
                     words = [line[1] for line in conllu_sent]

@@ -378,7 +378,7 @@ def _load_pretrained_tok2vec(nlp, loc):
     """Load pretrained weights for the 'token-to-vector' part of the component
     models, which is typically a CNN. See 'spacy pretrain'. Experimental.
     """
-    with Path(loc).open("rb") as file_:
+    with Path(loc).open("rb", encoding="utf8") as file_:
         weights_data = file_.read()
     loaded = []
     for name, component in nlp.pipeline:
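
One caveat on the hunk above: "rb" opens the file in binary mode, and in Python 3 binary mode does not accept an encoding argument at all; passing one raises ValueError: binary mode doesn't take an encoding argument. Since a binary read returns raw bytes and never decodes text, the original Path(loc).open("rb") needed no change here. A quick demonstration of the error (the file name and payload are made up):

from pathlib import Path

blob = Path("weights.bin")
blob.write_bytes(b"\x00\x01\x02")  # made-up payload for illustration

try:
    blob.open("rb", encoding="utf8")
except ValueError as err:
    print(err)  # -> binary mode doesn't take an encoding argument

# Binary reads need no encoding; the bytes come back verbatim.
with blob.open("rb") as file_:
    weights_data = file_.read()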

@@ -519,8 +519,8 @@ def main(
     for i in range(config.nr_epoch):
         docs, golds = read_data(
             nlp,
-            paths.train.conllu.open(),
-            paths.train.text.open(),
+            paths.train.conllu.open(encoding="utf8"),
+            paths.train.text.open(encoding="utf8"),
             max_doc_length=config.max_doc_length,
             limit=limit,
             oracle_segments=use_oracle_segments,
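
The same reasoning applies in this hunk: pathlib.Path.open() forwards its keyword arguments to the built-in open(), so paths.train.conllu.open(encoding="utf8") behaves exactly like open(str(paths.train.conllu), encoding="utf8").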

@@ -560,7 +560,7 @@ def main(
 
 def _render_parses(i, to_render):
     to_render[0].user_data["title"] = "Batch %d" % i
-    with Path("/tmp/parses.html").open("w") as file_:
+    with Path("/tmp/parses.html").open("w", encoding="utf8") as file_:
         html = displacy.render(to_render[:5], style="dep", page=True)
         file_.write(html)
 
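
Explicit encodings matter on the write side too: displacy.render() returns a Python string that can contain arbitrary Unicode (non-Latin tokens, arrow glyphs), and writing it with a locale default such as cp1252 raises UnicodeEncodeError for any character outside that codepage. A minimal sketch, with an invented HTML string standing in for displaCy's output and a relative path instead of /tmp so it runs anywhere:

from pathlib import Path

html = "<html><body>Zażółć gęślą jaźń</body></html>"  # characters outside cp1252

# utf8 can encode every Python string, so this write succeeds on any platform.
with Path("parses.html").open("w", encoding="utf8") as file_:
    file_.write(html)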

@@ -421,7 +421,7 @@ def env_opt(name, default=None):
 
 def read_regex(path):
     path = ensure_path(path)
-    with path.open() as file_:
+    with path.open(encoding="utf8") as file_:
         entries = file_.read().split("\n")
         expression = "|".join(
             ["^" + re.escape(piece) for piece in entries if piece.strip()]
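
For context on the function being touched: read_regex turns a file of literal pieces into one compiled alternation, escaping each piece so regex metacharacters match literally and anchoring each branch at the start of the string. A self-contained sketch of the same technique (the file name and entries are invented):

import re
from pathlib import Path

path = Path("prefixes.txt")  # hypothetical input: one literal prefix per line
path.write_text("(\n[\n'\n", encoding="utf8")

with path.open(encoding="utf8") as file_:
    entries = file_.read().split("\n")

# re.escape makes "(" and "[" match themselves; "^" anchors each branch.
expression = "|".join("^" + re.escape(piece) for piece in entries if piece.strip())
matcher = re.compile(expression)

print(bool(matcher.match("(hello")))  # True: starts with a listed prefix
print(bool(matcher.match("hello")))   # False: no listed prefix matches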