mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-24 23:50:45 +03:00
Merge remote-tracking branch 'refs/remotes/honnibal/master'
This commit is contained in:
commit
6de26d312c
|
@ -1,6 +1,6 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import joblib
|
import joblib
|
||||||
|
@ -12,9 +12,11 @@ import codecs
|
||||||
from preshed.counter import PreshCounter
|
from preshed.counter import PreshCounter
|
||||||
from joblib import Parallel, delayed
|
from joblib import Parallel, delayed
|
||||||
|
|
||||||
import spacy.en
|
from spacy.en import English
|
||||||
from spacy.strings import StringStore
|
from spacy.strings import StringStore
|
||||||
from spacy.en.attrs import ORTH
|
from spacy.attrs import ORTH
|
||||||
|
from spacy.tokenizer import Tokenizer
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
|
|
||||||
def iter_comments(loc):
|
def iter_comments(loc):
|
||||||
|
@ -23,37 +25,22 @@ def iter_comments(loc):
|
||||||
yield ujson.loads(line)
|
yield ujson.loads(line)
|
||||||
|
|
||||||
|
|
||||||
def null_props(string):
|
|
||||||
return {
|
|
||||||
'flags': 0,
|
|
||||||
'length': len(string),
|
|
||||||
'orth': string,
|
|
||||||
'lower': string,
|
|
||||||
'norm': string,
|
|
||||||
'shape': string,
|
|
||||||
'prefix': string,
|
|
||||||
'suffix': string,
|
|
||||||
'cluster': 0,
|
|
||||||
'prob': -22,
|
|
||||||
'sentiment': 0
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def count_freqs(input_loc, output_loc):
|
def count_freqs(input_loc, output_loc):
|
||||||
print output_loc
|
print(output_loc)
|
||||||
nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
|
vocab = English.default_vocab(get_lex_attr=None)
|
||||||
nlp.vocab.lexeme_props_getter = null_props
|
tokenizer = Tokenizer.from_dir(vocab,
|
||||||
|
path.join(English.default_data_dir(), 'tokenizer'))
|
||||||
|
|
||||||
counts = PreshCounter()
|
counts = PreshCounter()
|
||||||
tokenizer = nlp.tokenizer
|
|
||||||
for json_comment in iter_comments(input_loc):
|
for json_comment in iter_comments(input_loc):
|
||||||
doc = tokenizer(json_comment['body'])
|
doc = tokenizer(json_comment['body'])
|
||||||
doc.count_by(ORTH, counts=counts)
|
doc.count_by(ORTH, counts=counts)
|
||||||
|
|
||||||
with codecs.open(output_loc, 'w', 'utf8') as file_:
|
with codecs.open(output_loc, 'w', 'utf8') as file_:
|
||||||
for orth, freq in counts:
|
for orth, freq in counts:
|
||||||
string = nlp.vocab.strings[orth]
|
string = tokenizer.vocab.strings[orth]
|
||||||
file_.write('%d\t%s\n' % (freq, repr(string)))
|
if not string.isspace():
|
||||||
|
file_.write('%d\t%s\n' % (freq, string))
|
||||||
|
|
||||||
|
|
||||||
def parallelize(func, iterator, n_jobs):
|
def parallelize(func, iterator, n_jobs):
|
||||||
|
@ -64,12 +51,12 @@ def merge_counts(locs, out_loc):
|
||||||
string_map = StringStore()
|
string_map = StringStore()
|
||||||
counts = PreshCounter()
|
counts = PreshCounter()
|
||||||
for loc in locs:
|
for loc in locs:
|
||||||
with codecs.open(loc, 'r', 'utf8') as file_:
|
with io.open(loc, 'r', encoding='utf8') as file_:
|
||||||
for line in file_:
|
for line in file_:
|
||||||
freq, word = line.strip().split('\t', 1)
|
freq, word = line.strip().split('\t', 1)
|
||||||
orth = string_map[word]
|
orth = string_map[word]
|
||||||
counts.inc(orth, int(freq))
|
counts.inc(orth, int(freq))
|
||||||
with codecs.open(out_loc, 'w', 'utf8') as file_:
|
with io.open(out_loc, 'w', encoding='utf8') as file_:
|
||||||
for orth, count in counts:
|
for orth, count in counts:
|
||||||
string = string_map[orth]
|
string = string_map[orth]
|
||||||
file_.write('%d\t%s\n' % (count, string))
|
file_.write('%d\t%s\n' % (count, string))
|
||||||
|
@ -98,7 +85,7 @@ def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
|
||||||
if tasks:
|
if tasks:
|
||||||
parallelize(count_freqs, tasks, n_jobs)
|
parallelize(count_freqs, tasks, n_jobs)
|
||||||
|
|
||||||
print "Merge"
|
print("Merge")
|
||||||
merge_counts(outputs, output_loc)
|
merge_counts(outputs, output_loc)
|
||||||
|
|
||||||
|
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -179,7 +179,7 @@ VERSION = '0.95'
|
||||||
def main(modules, is_pypy):
|
def main(modules, is_pypy):
|
||||||
language = "cpp"
|
language = "cpp"
|
||||||
includes = ['.', path.join(sys.prefix, 'include')]
|
includes = ['.', path.join(sys.prefix, 'include')]
|
||||||
if sys.prefix == 'darwin':
|
if sys.platform.startswith('darwin'):
|
||||||
compile_options['other'].append(['-mmacosx-version-min=10.8', '-stdlib=libc++'])
|
compile_options['other'].append(['-mmacosx-version-min=10.8', '-stdlib=libc++'])
|
||||||
link_opions['other'].append('-lc++')
|
link_opions['other'].append('-lc++')
|
||||||
if use_cython:
|
if use_cython:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user