Merge remote-tracking branch 'refs/remotes/honnibal/master'

maxirmx 2015-10-16 11:59:57 +03:00
commit 6de26d312c
2 changed files with 16 additions and 29 deletions

View File

@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-from __future__ import unicode_literals
+from __future__ import unicode_literals, print_function
 import plac
 import joblib
@@ -12,9 +12,11 @@ import codecs
 from preshed.counter import PreshCounter
 from joblib import Parallel, delayed
-import spacy.en
+from spacy.en import English
 from spacy.strings import StringStore
-from spacy.en.attrs import ORTH
+from spacy.attrs import ORTH
+from spacy.tokenizer import Tokenizer
+from spacy.vocab import Vocab

 def iter_comments(loc):
@@ -23,37 +25,22 @@ def iter_comments(loc):
             yield ujson.loads(line)

-def null_props(string):
-    return {
-        'flags': 0,
-        'length': len(string),
-        'orth': string,
-        'lower': string,
-        'norm': string,
-        'shape': string,
-        'prefix': string,
-        'suffix': string,
-        'cluster': 0,
-        'prob': -22,
-        'sentiment': 0
-    }

 def count_freqs(input_loc, output_loc):
-    print output_loc
-    nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
-    nlp.vocab.lexeme_props_getter = null_props
+    print(output_loc)
+    vocab = English.default_vocab(get_lex_attr=None)
+    tokenizer = Tokenizer.from_dir(vocab,
+                    path.join(English.default_data_dir(), 'tokenizer'))

     counts = PreshCounter()
-    tokenizer = nlp.tokenizer
     for json_comment in iter_comments(input_loc):
         doc = tokenizer(json_comment['body'])
         doc.count_by(ORTH, counts=counts)
     with codecs.open(output_loc, 'w', 'utf8') as file_:
         for orth, freq in counts:
-            string = nlp.vocab.strings[orth]
-            file_.write('%d\t%s\n' % (freq, repr(string)))
+            string = tokenizer.vocab.strings[orth]
+            if not string.isspace():
+                file_.write('%d\t%s\n' % (freq, string))

 def parallelize(func, iterator, n_jobs):
@@ -64,12 +51,12 @@ def merge_counts(locs, out_loc):
     string_map = StringStore()
     counts = PreshCounter()
     for loc in locs:
-        with codecs.open(loc, 'r', 'utf8') as file_:
+        with io.open(loc, 'r', encoding='utf8') as file_:
             for line in file_:
                 freq, word = line.strip().split('\t', 1)
                 orth = string_map[word]
                 counts.inc(orth, int(freq))
-    with codecs.open(out_loc, 'w', 'utf8') as file_:
+    with io.open(out_loc, 'w', encoding='utf8') as file_:
         for orth, count in counts:
             string = string_map[orth]
             file_.write('%d\t%s\n' % (count, string))
@@ -98,7 +85,7 @@ def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
     if tasks:
         parallelize(count_freqs, tasks, n_jobs)

-    print "Merge"
+    print("Merge")
     merge_counts(outputs, output_loc)
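
Taken together, the changes to this file move the frequency-counting script to Python 3-compatible idioms (print_function, io.open in place of codecs.open) and build a standalone tokenizer instead of a full English() pipeline with every component switched off. Below is a minimal sketch of the resulting pattern, assuming the spaCy 0.95-era API that appears in the diff; the 'comments.jsonl' and 'freqs.tsv' paths are hypothetical stand-ins for the script's plac-driven arguments and its iter_comments() helper.

    from __future__ import unicode_literals, print_function
    import io
    from os import path

    import ujson
    from preshed.counter import PreshCounter
    from spacy.en import English
    from spacy.attrs import ORTH
    from spacy.tokenizer import Tokenizer

    # Build a bare tokenizer from the on-disk data; this replaces the old
    # English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
    # construction that disabled each pipeline component by hand.
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))

    counts = PreshCounter()
    with io.open('comments.jsonl', 'r', encoding='utf8') as file_:  # hypothetical input
        for line in file_:
            doc = tokenizer(ujson.loads(line)['body'])
            doc.count_by(ORTH, counts=counts)  # accumulate ORTH-id frequencies

    with io.open('freqs.tsv', 'w', encoding='utf8') as file_:  # hypothetical output
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]  # decode id to surface form
            if not string.isspace():  # skip whitespace-only tokens
                file_.write('%d\t%s\n' % (freq, string))

Note that the isspace() guard also replaces the old repr() escaping, so the per-shard counts files that merge_counts() later reads back contain raw token text.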

View File

@@ -179,7 +179,7 @@ VERSION = '0.95'
 def main(modules, is_pypy):
     language = "cpp"
     includes = ['.', path.join(sys.prefix, 'include')]
-    if sys.prefix == 'darwin':
+    if sys.platform.startswith('darwin'):
         compile_options['other'].append(['-mmacosx-version-min=10.8', '-stdlib=libc++'])
         link_opions['other'].append('-lc++')
     if use_cython:
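
The setup.py change is a one-line bug fix: sys.prefix is the interpreter's installation prefix (a filesystem path such as '/usr'), so it can never equal 'darwin', and the OS X-specific compiler flags were silently skipped. A small sketch of the corrected check; compile_args and link_args are hypothetical names standing in for the script's option dicts.

    import sys

    # sys.platform names the operating system: 'darwin' on OS X, and
    # historically 'linux2' on Linux, which is why startswith() is the
    # conventional idiom rather than an equality test.
    if sys.platform.startswith('darwin'):
        compile_args = ['-mmacosx-version-min=10.8', '-stdlib=libc++']
        link_args = ['-lc++']
    else:
        compile_args, link_args = [], []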