mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-12 01:02:23 +03:00
* Update get_freqs.py script
This commit is contained in:
parent
a748146dd3
commit
5ff4454177
|
@@ -25,25 +25,10 @@ def iter_comments(loc):
|
||||||
yield ujson.loads(line)
|
yield ujson.loads(line)
|
||||||
|
|
||||||
|
|
||||||
def null_props(string):
    """Build a default lexeme-properties dict for *string*.

    Every string-valued attribute (orth, lower, norm, shape, prefix,
    suffix) is filled with the string itself; the numeric attributes are
    set to fixed placeholder values (prob -22 appears to be a sentinel
    "unknown word" log-probability — TODO confirm against the vocab).
    """
    return dict(
        flags=0,
        length=len(string),
        orth=string,
        lower=string,
        norm=string,
        shape=string,
        prefix=string,
        suffix=string,
        cluster=0,
        prob=-22,
        sentiment=0,
    )
|
|
||||||
|
|
||||||
|
|
||||||
def count_freqs(input_loc, output_loc):
|
def count_freqs(input_loc, output_loc):
|
||||||
print(output_loc)
|
print(output_loc)
|
||||||
tokenizer = Tokenizer.from_dir(Vocab(), spacy.en.English.default_data_dir())
|
tokenizer = Tokenizer.from_dir(Vocab(),
|
||||||
|
path.join(spacy.en.English.default_data_dir(), 'tokenizer'))
|
||||||
|
|
||||||
counts = PreshCounter()
|
counts = PreshCounter()
|
||||||
for json_comment in iter_comments(input_loc):
|
for json_comment in iter_comments(input_loc):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user