mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Merge remote-tracking branch 'refs/remotes/honnibal/master'
This commit is contained in:
		
						commit
						6de26d312c
					
				| 
						 | 
					@ -1,6 +1,6 @@
 | 
				
			||||||
#!/usr/bin/env python
 | 
					#!/usr/bin/env python
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from __future__ import unicode_literals
 | 
					from __future__ import unicode_literals, print_function
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import plac
 | 
					import plac
 | 
				
			||||||
import joblib
 | 
					import joblib
 | 
				
			||||||
| 
						 | 
					@ -12,9 +12,11 @@ import codecs
 | 
				
			||||||
from preshed.counter import PreshCounter
 | 
					from preshed.counter import PreshCounter
 | 
				
			||||||
from joblib import Parallel, delayed
 | 
					from joblib import Parallel, delayed
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import spacy.en
 | 
					from spacy.en import English
 | 
				
			||||||
from spacy.strings import StringStore
 | 
					from spacy.strings import StringStore
 | 
				
			||||||
from spacy.en.attrs import ORTH
 | 
					from spacy.attrs import ORTH
 | 
				
			||||||
 | 
					from spacy.tokenizer import Tokenizer
 | 
				
			||||||
 | 
					from spacy.vocab import Vocab
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def iter_comments(loc):
 | 
					def iter_comments(loc):
 | 
				
			||||||
| 
						 | 
					@ -23,37 +25,22 @@ def iter_comments(loc):
 | 
				
			||||||
            yield ujson.loads(line)
 | 
					            yield ujson.loads(line)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def null_props(string):
 | 
					 | 
				
			||||||
    return {
 | 
					 | 
				
			||||||
        'flags': 0,
 | 
					 | 
				
			||||||
        'length': len(string),
 | 
					 | 
				
			||||||
        'orth': string,
 | 
					 | 
				
			||||||
        'lower': string,
 | 
					 | 
				
			||||||
        'norm': string,
 | 
					 | 
				
			||||||
        'shape': string,
 | 
					 | 
				
			||||||
        'prefix': string,
 | 
					 | 
				
			||||||
        'suffix': string,
 | 
					 | 
				
			||||||
        'cluster': 0,
 | 
					 | 
				
			||||||
        'prob': -22,
 | 
					 | 
				
			||||||
        'sentiment': 0
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def count_freqs(input_loc, output_loc):
 | 
					def count_freqs(input_loc, output_loc):
 | 
				
			||||||
    print output_loc
 | 
					    print(output_loc)
 | 
				
			||||||
    nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
 | 
					    vocab = English.default_vocab(get_lex_attr=None)
 | 
				
			||||||
    nlp.vocab.lexeme_props_getter = null_props
 | 
					    tokenizer = Tokenizer.from_dir(vocab,
 | 
				
			||||||
 | 
					                    path.join(English.default_data_dir(), 'tokenizer'))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    counts = PreshCounter()
 | 
					    counts = PreshCounter()
 | 
				
			||||||
    tokenizer = nlp.tokenizer
 | 
					 | 
				
			||||||
    for json_comment in iter_comments(input_loc):
 | 
					    for json_comment in iter_comments(input_loc):
 | 
				
			||||||
        doc = tokenizer(json_comment['body'])
 | 
					        doc = tokenizer(json_comment['body'])
 | 
				
			||||||
        doc.count_by(ORTH, counts=counts)
 | 
					        doc.count_by(ORTH, counts=counts)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    with codecs.open(output_loc, 'w', 'utf8') as file_:
 | 
					    with codecs.open(output_loc, 'w', 'utf8') as file_:
 | 
				
			||||||
        for orth, freq in counts:
 | 
					        for orth, freq in counts:
 | 
				
			||||||
            string = nlp.vocab.strings[orth]
 | 
					            string = tokenizer.vocab.strings[orth]
 | 
				
			||||||
            file_.write('%d\t%s\n' % (freq, repr(string)))
 | 
					            if not string.isspace():
 | 
				
			||||||
 | 
					                file_.write('%d\t%s\n' % (freq, string))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def parallelize(func, iterator, n_jobs):
 | 
					def parallelize(func, iterator, n_jobs):
 | 
				
			||||||
| 
						 | 
					@ -64,12 +51,12 @@ def merge_counts(locs, out_loc):
 | 
				
			||||||
    string_map = StringStore()
 | 
					    string_map = StringStore()
 | 
				
			||||||
    counts = PreshCounter()
 | 
					    counts = PreshCounter()
 | 
				
			||||||
    for loc in locs:
 | 
					    for loc in locs:
 | 
				
			||||||
        with codecs.open(loc, 'r', 'utf8') as file_:
 | 
					        with io.open(loc, 'r', encoding='utf8') as file_:
 | 
				
			||||||
            for line in file_:
 | 
					            for line in file_:
 | 
				
			||||||
                freq, word = line.strip().split('\t', 1)
 | 
					                freq, word = line.strip().split('\t', 1)
 | 
				
			||||||
                orth = string_map[word]
 | 
					                orth = string_map[word]
 | 
				
			||||||
                counts.inc(orth, int(freq))
 | 
					                counts.inc(orth, int(freq))
 | 
				
			||||||
    with codecs.open(out_loc, 'w', 'utf8') as file_:
 | 
					    with io.open(out_loc, 'w', encoding='utf8') as file_:
 | 
				
			||||||
        for orth, count in counts:
 | 
					        for orth, count in counts:
 | 
				
			||||||
            string = string_map[orth]
 | 
					            string = string_map[orth]
 | 
				
			||||||
            file_.write('%d\t%s\n' % (count, string))
 | 
					            file_.write('%d\t%s\n' % (count, string))
 | 
				
			||||||
| 
						 | 
					@ -98,7 +85,7 @@ def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
 | 
				
			||||||
    if tasks:
 | 
					    if tasks:
 | 
				
			||||||
        parallelize(count_freqs, tasks, n_jobs)
 | 
					        parallelize(count_freqs, tasks, n_jobs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    print "Merge"
 | 
					    print("Merge")
 | 
				
			||||||
    merge_counts(outputs, output_loc)
 | 
					    merge_counts(outputs, output_loc)
 | 
				
			||||||
                
 | 
					                
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
							| 
						 | 
					@ -179,7 +179,7 @@ VERSION = '0.95'
 | 
				
			||||||
def main(modules, is_pypy):
 | 
					def main(modules, is_pypy):
 | 
				
			||||||
    language = "cpp"
 | 
					    language = "cpp"
 | 
				
			||||||
    includes = ['.', path.join(sys.prefix, 'include')]
 | 
					    includes = ['.', path.join(sys.prefix, 'include')]
 | 
				
			||||||
    if sys.prefix == 'darwin':
 | 
					    if sys.platform.startswith('darwin'):
 | 
				
			||||||
        compile_options['other'].append(['-mmacosx-version-min=10.8', '-stdlib=libc++'])
 | 
					        compile_options['other'].append(['-mmacosx-version-min=10.8', '-stdlib=libc++'])
 | 
				
			||||||
        link_opions['other'].append('-lc++')
 | 
					        link_opions['other'].append('-lc++')
 | 
				
			||||||
    if use_cython:
 | 
					    if use_cython:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user