mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 18:06:29 +03:00)

Merge branch 'develop' of https://github.com/honnibal/spaCy into develop

This commit is contained in commit 46fbf29019
@@ -1,5 +1,4 @@
 from __future__ import print_function
-from os import path
 import sys
 import os
 import tarfile
@@ -15,45 +14,44 @@ AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com'
 
 ALL_DATA_DIR_URL = '%s/en_data_all-%s.tgz' % (AWS_STORE, VERSION)
 
-DEST_DIR = path.join(path.dirname(path.abspath(__file__)), 'data')
+DEST_DIR = os.path.dirname(os.path.abspath(__file__))
 
 
-def download_file(url, dest_dir):
-    return uget.download(url, dest_dir, console=sys.stdout)
+def download_file(url, download_path):
+    return uget.download(url, download_path, console=sys.stdout)
 
 
-def install_data(url, dest_dir):
-    filename = download_file(url, dest_dir)
-    t = tarfile.open(filename)
-    t.extractall(dest_dir)
-
-
-def install_parser_model(url, dest_dir):
-    filename = download_file(url, dest_dir)
-    t = tarfile.open(filename, mode=":gz")
-    t.extractall(dest_dir)
-
-
-def install_dep_vectors(url, dest_dir):
-    download_file(url, dest_dir)
+def install_data(url, extract_path, download_path):
+    try:
+        os.makedirs(extract_path)
+    except FileExistsError:
+        pass
+
+    tmp = download_file(url, download_path)
+    assert tmp == download_path
+    t = tarfile.open(download_path)
+    t.extractall(extract_path)
 
 
 @plac.annotations(
     force=("Force overwrite", "flag", "f", bool),
 )
 def main(data_size='all', force=False):
-    if data_size == 'all':
-        data_url = ALL_DATA_DIR_URL
-    elif data_size == 'small':
-        data_url = SM_DATA_DIR_URL
+    filename = ALL_DATA_DIR_URL.rsplit('/', 1)[1]
+    download_path = os.path.join(DEST_DIR, filename)
+    data_path = os.path.join(DEST_DIR, 'data')
 
-    if force and path.exists(DEST_DIR):
-        shutil.rmtree(DEST_DIR)
+    if force and os.path.exists(download_path):
+        os.unlink(download_path)
 
-    if not os.path.exists(DEST_DIR):
-        os.makedirs(DEST_DIR)
+    if force and os.path.exists(data_path):
+        shutil.rmtree(data_path)
 
-    install_data(data_url, DEST_DIR)
+    if os.path.exists(data_path):
+        print('data already installed at %s, overwrite with --force' % DEST_DIR)
+        sys.exit(1)
+
+    install_data(ALL_DATA_DIR_URL, DEST_DIR, download_path)
 
 
 if __name__ == '__main__':
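The changed download script now saves the release tarball to a fixed path next to the module, refuses to overwrite already installed data unless --force is passed, and unpacks the archive with tarfile. Below is a minimal standalone sketch of that flow, assuming Python 3 and substituting urllib.request for spaCy's internal uget helper; the URL and directory names are placeholders, not real release locations.

# Minimal sketch of the new download-then-extract flow (assumptions: Python 3,
# urllib.request in place of spaCy's internal `uget`; URL and names are
# placeholders, not real release locations).
import os
import shutil
import sys
import tarfile
import urllib.request

DATA_URL = 'https://example.com/en_data_all-0.0.0.tgz'  # placeholder URL
DEST_DIR = os.path.dirname(os.path.abspath(__file__))


def install_data(url, extract_path, download_path):
    # Create the extraction directory up front; tolerate it already existing.
    try:
        os.makedirs(extract_path)
    except FileExistsError:
        pass
    # Download the archive next to this module, then unpack it in place.
    urllib.request.urlretrieve(url, download_path)
    with tarfile.open(download_path) as archive:
        archive.extractall(extract_path)


def main(force=False):
    filename = DATA_URL.rsplit('/', 1)[1]
    download_path = os.path.join(DEST_DIR, filename)
    data_path = os.path.join(DEST_DIR, 'data')

    # --force removes any stale archive and previously extracted data first.
    if force and os.path.exists(download_path):
        os.unlink(download_path)
    if force and os.path.exists(data_path):
        shutil.rmtree(data_path)

    if os.path.exists(data_path):
        print('data already installed at %s, overwrite with --force' % DEST_DIR)
        sys.exit(1)

    install_data(DATA_URL, DEST_DIR, download_path)


if __name__ == '__main__':
    main(force='--force' in sys.argv[1:])

The archive is assumed to unpack into a data/ subdirectory next to the module, which is what the existence check guards against re-installing.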
@@ -1,5 +1,6 @@
 from os import path
 from warnings import warn
+import io
 
 try:
     import ujson as json
@@ -248,8 +249,8 @@ class Language(object):
         self.entity.model.end_training(path.join(data_dir, 'ner', 'model'))
         self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
 
-        strings_loc = path.join(data_dir, 'vocab', 'strings.txt')
-        with io.open(strings_loc, 'w', encoding='utf8'):
+        strings_loc = path.join(data_dir, 'vocab', 'strings.json')
+        with io.open(strings_loc, 'w', encoding='utf8') as file_:
             self.vocab.strings.dump(file_)
 
         with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
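The second hunk above does two things: the string store is now dumped to strings.json rather than strings.txt, and the with statement gains the `as file_` binding, so the `self.vocab.strings.dump(file_)` call inside the block actually refers to the handle being opened. A small sketch of the corrected write pattern, assuming Python 3 and using a stand-in dump method since StringStore is spaCy-internal:

# Sketch of the corrected save pattern: bind the handle with `as file_` and
# pass it to the dump routine. StringTable.dump is a stand-in for spaCy's
# StringStore.dump; only the with/as fix and the .json filename come from
# the diff above.
import io
import json
from os import path


class StringTable(object):
    def __init__(self, strings):
        self.strings = list(strings)

    def dump(self, file_):
        # Write the table as JSON, matching the new strings.json name.
        file_.write(json.dumps(self.strings))


def save_strings(string_table, data_dir):
    strings_loc = path.join(data_dir, 'vocab', 'strings.json')
    # `as file_` binds the open handle; the dump call below depends on it.
    with io.open(strings_loc, 'w', encoding='utf8') as file_:
        string_table.dump(file_)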