set encodings explicitly to utf8 (#4551)

This commit is contained in:
Sofie Van Landeghem 2019-10-29 13:16:55 +01:00 committed by Ines Montani
parent 9e210fa7fd
commit 33ba9ff464
3 changed files with 7 additions and 7 deletions

View File

@@ -84,7 +84,7 @@ def read_conllu(file_):
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
if text_loc.parts[-1].endswith(".conllu"): if text_loc.parts[-1].endswith(".conllu"):
docs = [] docs = []
with text_loc.open() as file_: with text_loc.open(encoding="utf8") as file_:
for conllu_doc in read_conllu(file_): for conllu_doc in read_conllu(file_):
for conllu_sent in conllu_doc: for conllu_sent in conllu_doc:
words = [line[1] for line in conllu_sent] words = [line[1] for line in conllu_sent]

View File

@@ -203,7 +203,7 @@ def golds_to_gold_tuples(docs, golds):
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
if text_loc.parts[-1].endswith(".conllu"): if text_loc.parts[-1].endswith(".conllu"):
docs = [] docs = []
with text_loc.open() as file_: with text_loc.open(encoding="utf8") as file_:
for conllu_doc in read_conllu(file_): for conllu_doc in read_conllu(file_):
for conllu_sent in conllu_doc: for conllu_sent in conllu_doc:
words = [line[1] for line in conllu_sent] words = [line[1] for line in conllu_sent]
@@ -378,7 +378,7 @@ def _load_pretrained_tok2vec(nlp, loc):
"""Load pretrained weights for the 'token-to-vector' part of the component """Load pretrained weights for the 'token-to-vector' part of the component
models, which is typically a CNN. See 'spacy pretrain'. Experimental. models, which is typically a CNN. See 'spacy pretrain'. Experimental.
""" """
with Path(loc).open("rb") as file_: with Path(loc).open("rb", encoding="utf8") as file_:
weights_data = file_.read() weights_data = file_.read()
loaded = [] loaded = []
for name, component in nlp.pipeline: for name, component in nlp.pipeline:
@@ -519,8 +519,8 @@ def main(
for i in range(config.nr_epoch): for i in range(config.nr_epoch):
docs, golds = read_data( docs, golds = read_data(
nlp, nlp,
paths.train.conllu.open(), paths.train.conllu.open(encoding="utf8"),
paths.train.text.open(), paths.train.text.open(encoding="utf8"),
max_doc_length=config.max_doc_length, max_doc_length=config.max_doc_length,
limit=limit, limit=limit,
oracle_segments=use_oracle_segments, oracle_segments=use_oracle_segments,
@@ -560,7 +560,7 @@ def main(
def _render_parses(i, to_render): def _render_parses(i, to_render):
to_render[0].user_data["title"] = "Batch %d" % i to_render[0].user_data["title"] = "Batch %d" % i
with Path("/tmp/parses.html").open("w") as file_: with Path("/tmp/parses.html").open("w", encoding="utf8") as file_:
html = displacy.render(to_render[:5], style="dep", page=True) html = displacy.render(to_render[:5], style="dep", page=True)
file_.write(html) file_.write(html)

View File

@@ -421,7 +421,7 @@ def env_opt(name, default=None):
def read_regex(path): def read_regex(path):
path = ensure_path(path) path = ensure_path(path)
with path.open() as file_: with path.open(encoding="utf8") as file_:
entries = file_.read().split("\n") entries = file_.read().split("\n")
expression = "|".join( expression = "|".join(
["^" + re.escape(piece) for piece in entries if piece.strip()] ["^" + re.escape(piece) for piece in entries if piece.strip()]