def utf8open(loc, mode='r'):
    """Open the file at ``loc`` with ``codecs.open`` using the 'utf8' codec.

    Args:
        loc: Path of the file to open.
        mode: File mode forwarded to ``codecs.open`` (defaults to ``'r'``).

    Returns:
        The file object returned by ``codecs.open``.
    """
    return codecs.open(loc, mode, 'utf8')


def load_case_stats(data_dir):
    """Load the per-word case statistics stored in ``<data_dir>/english.case``.

    Every non-blank line must hold exactly three whitespace-separated
    fields: ``word upper title``.  The last two are parsed as floats
    (presumably the upper-case and title-case rates of the word -- confirm
    against whatever produces the file).

    Args:
        data_dir: Directory containing the ``english.case`` file.

    Returns:
        A dict mapping each word to an ``(upper, title)`` tuple of floats.
        If a word appears more than once, the last line wins.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields,
            or a numeric field cannot be parsed as a float.
    """
    case_loc = path.join(data_dir, 'english.case')
    case_stats = {}
    with utf8open(case_loc) as cases_file:
        for line in cases_file:
            fields = line.split()
            # Skip blank lines (e.g. a trailing empty line) instead of
            # failing on the unpack below; load_clitics tolerates them too.
            if not fields:
                continue
            word, upper, title = fields
            case_stats[word] = (float(upper), float(title))
    return case_stats


def load_clitics(data_dir):
    """Load the clitic entries stored in ``<data_dir>/clitics.txt``.

    Blank lines and lines whose first non-whitespace character is ``#`` are
    ignored.  Every other line is split on whitespace into
    ``word norm_form [clitic ...]``.

    Args:
        data_dir: Directory containing the ``clitics.txt`` file.

    Returns:
        A list of ``(word, norm_form, clitics)`` tuples in file order, where
        ``clitics`` is the (possibly empty) list of remaining fields.

    Raises:
        AssertionError: If the same ``word`` is listed more than once; the
            offending word is the exception argument.
        IndexError: If an entry line has fewer than two fields.
    """
    clitics_loc = path.join(data_dir, 'clitics.txt')
    entries = []
    seen = set()
    with utf8open(clitics_loc) as clitics_file:
        for line in clitics_file:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            fields = line.split()
            word = fields[0]
            norm_form = fields[1]
            clitics = fields[2:]
            # Raised explicitly rather than via ``assert`` so the duplicate
            # check still runs under ``python -O``; the exception type and
            # message are the same as the assert produced.
            if word in seen:
                raise AssertionError(word)
            seen.add(word)
            entries.append((word, norm_form, clitics))
    return entries


# The string below holds disabled Cython loader methods.  It is a bare string
# expression, so none of it is executed.
"""
    def load_browns(self, data_dir):
        cdef Lexeme* w
        case_stats = load_case_stats(data_dir)
        brown_loc = path.join(data_dir, 'bllip-clusters')
        assert path.exists(brown_loc)
        cdef size_t start
        cdef int end
        with utf8open(brown_loc) as browns_file:
            for i, line in enumerate(browns_file):
                cluster_str, word, freq_str = line.split()
                # Decode as a little-endian string, so that we can do & 15 to get
                # the first 4 bits. See redshift._parse_features.pyx
                cluster = int(cluster_str[::-1], 2)
                upper_pc, title_pc = case_stats.get(word.lower(), (0.0, 0.0))
                start = 0
                end = -1
                find_slice(&start, &end, word)
                print "Load", repr(word), start, end
                w = init_word(word, start, end, cluster, upper_pc, title_pc, int(freq_str))
                self.words[_hash_str(word)] = w
                self.strings[w] = word

    def load_clitics(self, data_dir):
        cdef unicode orig_str
        cdef unicode clitic
        for orig_str, norm_form, clitic_strs in util.load_clitics(data_dir):
            w = init_clitic(orig_str, self.lookup_slice(norm_form, 0, -1))
            self.words[w.orig] = w
            self.strings[w] = orig_str
            assert len(clitic_strs) < MAX_CLITICS
            assert clitic_strs
            for i, clitic in enumerate(clitic_strs):
                # If we write punctuation here, assume we want to keep it,
                # so tell it the slice boundaries (the full string)
                w.clitics[i] = self.lookup_slice(clitic, 0, -1)
            # Ensure we null terminate
            w.clitics[i+1] = 0
"""