From 4f703f0cb49bbe6aa10b6fa1b527c043cc95005b Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Tue, 20 Oct 2015 19:11:29 +0200 Subject: [PATCH 1/4] better error reporting, cleanup --- spacy/en/download.py | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/spacy/en/download.py b/spacy/en/download.py index 6180c4766..8711a390b 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -18,42 +18,38 @@ ALL_DATA_DIR_URL = '%s/en_data_all-%s.tgz' % (AWS_STORE, VERSION) DEST_DIR = path.join(path.dirname(path.abspath(__file__)), 'data') -def download_file(url, dest_dir): - return uget.download(url, dest_dir, console=sys.stdout) +def download_file(url, path): + return uget.download(url, path, console=sys.stdout) -def install_data(url, dest_dir): - filename = download_file(url, dest_dir) +def install_data(url, path, filename): + try: + os.makedirs(path) + except FileExistsError: + pass + + filename = download_file(url, os.path.join(path, filename)) t = tarfile.open(filename) - t.extractall(dest_dir) - - -def install_parser_model(url, dest_dir): - filename = download_file(url, dest_dir) - t = tarfile.open(filename, mode=":gz") - t.extractall(dest_dir) - - -def install_dep_vectors(url, dest_dir): - download_file(url, dest_dir) + t.extractall(path) @plac.annotations( force=("Force overwrite", "flag", "f", bool), ) def main(data_size='all', force=False): - if data_size == 'all': - data_url = ALL_DATA_DIR_URL - elif data_size == 'small': - data_url = SM_DATA_DIR_URL - if force and path.exists(DEST_DIR): shutil.rmtree(DEST_DIR) - if not os.path.exists(DEST_DIR): - os.makedirs(DEST_DIR) + filename = ALL_DATA_DIR_URL.rsplit('/', 1)[1] - install_data(data_url, DEST_DIR) + if os.path.exists(DEST_DIR): + # ugly hack to find out whether something other + # than the currently wanted file lives there + if len([f for f in os.listdir(DEST_DIR) if f != filename]): + print('data already installed at %s, overwrite with --force' % DEST_DIR) + sys.exit(1) + + install_data(ALL_DATA_DIR_URL, DEST_DIR, filename) if __name__ == '__main__': From da4c9cee06e6a25470619fa20c1454cc78407d24 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Tue, 20 Oct 2015 19:33:59 +0200 Subject: [PATCH 2/4] assert filename match --- spacy/en/download.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/en/download.py b/spacy/en/download.py index 8711a390b..662948f93 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -28,8 +28,10 @@ def install_data(url, path, filename): except FileExistsError: pass - filename = download_file(url, os.path.join(path, filename)) - t = tarfile.open(filename) + download_path = os.path.join(path, filename) + tmp = download_file(url, download_path) + assert tmp == download_path + t = tarfile.open(download_path) t.extractall(path) From ccffd2ef53bad9d2a8e51c072948dcc08279ba86 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Wed, 21 Oct 2015 07:59:34 +0200 Subject: [PATCH 3/4] fixed extract directory --- spacy/en/download.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/spacy/en/download.py b/spacy/en/download.py index 662948f93..11ed96caa 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -1,5 +1,4 @@ from __future__ import print_function -from os import path import sys import os import tarfile @@ -15,43 +14,44 @@ AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com' ALL_DATA_DIR_URL = '%s/en_data_all-%s.tgz' % (AWS_STORE, VERSION) -DEST_DIR = path.join(path.dirname(path.abspath(__file__)), 'data') +DEST_DIR = os.path.dirname(os.path.abspath(__file__)) -def download_file(url, path): - return uget.download(url, path, console=sys.stdout) +def download_file(url, download_path): + return uget.download(url, download_path, console=sys.stdout) -def install_data(url, path, filename): +def install_data(url, extract_path, download_path): try: - os.makedirs(path) + os.makedirs(extract_path) except FileExistsError: pass - download_path = os.path.join(path, filename) tmp = download_file(url, download_path) assert tmp == download_path t = tarfile.open(download_path) - t.extractall(path) + t.extractall(extract_path) @plac.annotations( force=("Force overwrite", "flag", "f", bool), ) def main(data_size='all', force=False): - if force and path.exists(DEST_DIR): - shutil.rmtree(DEST_DIR) - filename = ALL_DATA_DIR_URL.rsplit('/', 1)[1] + download_path = os.path.join(DEST_DIR, filename) + data_path = os.path.join(DEST_DIR, 'data') - if os.path.exists(DEST_DIR): - # ugly hack to find out whether something other - # than the currently wanted file lives there - if len([f for f in os.listdir(DEST_DIR) if f != filename]): - print('data already installed at %s, overwrite with --force' % DEST_DIR) - sys.exit(1) + if force and os.path.exists(download_path): + os.unlink(download_path) - install_data(ALL_DATA_DIR_URL, DEST_DIR, filename) + if force and os.path.exists(data_path): + shutil.rmtree(data_path) + + if os.path.exists(data_path): + print('data already installed at %s, overwrite with --force' % DEST_DIR) + sys.exit(1) + + install_data(ALL_DATA_DIR_URL, DEST_DIR, download_path) if __name__ == '__main__': From f18fd8c6592c13de154cec6942f948a55b23a4e3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 23 Oct 2015 03:48:12 +1100 Subject: [PATCH 4/4] * Fix language.py for change in StringStore load API --- spacy/language.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index d09f25b84..691b3e97e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,5 +1,6 @@ from os import path from warnings import warn +import io try: import ujson as json @@ -248,8 +249,8 @@ class Language(object): self.entity.model.end_training(path.join(data_dir, 'ner', 'model')) self.tagger.model.end_training(path.join(data_dir, 'pos', 'model')) - strings_loc = path.join(data_dir, 'vocab', 'strings.txt') - with io.open(strings_loc, 'w', encoding='utf8'): + strings_loc = path.join(data_dir, 'vocab', 'strings.json') + with io.open(strings_loc, 'w', encoding='utf8') as file_: self.vocab.strings.dump(file_) with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_: