Merge branch 'alvations-master'

This commit is contained in:
Matthew Honnibal 2015-10-10 14:13:24 +11:00
commit 6ea8f99a10
9 changed files with 35 additions and 15 deletions

View File

@ -27,8 +27,8 @@ from pathlib import Path
from shutil import copyfile from shutil import copyfile
from shutil import copytree from shutil import copytree
import codecs
from collections import defaultdict from collections import defaultdict
import io
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors from spacy.vocab import write_binary_vectors
@ -61,7 +61,7 @@ def _read_clusters(loc):
print("Warning: Clusters file not found") print("Warning: Clusters file not found")
return {} return {}
clusters = {} clusters = {}
for line in codecs.open(str(loc), 'r', 'utf8'): for line in io.open(str(loc), 'r', encoding='utf8'):
try: try:
cluster, word, freq = line.split() cluster, word, freq = line.split()
except ValueError: except ValueError:
@ -88,7 +88,7 @@ def _read_probs(loc):
print("Probabilities file not found. Trying freqs.") print("Probabilities file not found. Trying freqs.")
return {}, 0.0 return {}, 0.0
probs = {} probs = {}
for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')): for i, line in enumerate(io.open(str(loc), 'r', encoding='utf8')):
prob, word = line.split() prob, word = line.split()
prob = float(prob) prob = float(prob)
probs[word] = prob probs[word] = prob

View File

@ -1,11 +1,11 @@
import codecs import io
import plac import plac
from spacy.en import English from spacy.en import English
def main(text_loc): def main(text_loc):
with codecs.open(text_loc, 'r', 'utf8') as file_: with io.open(text_loc, 'r', encoding='utf8') as file_:
text = file_.read() text = file_.read()
NLU = English() NLU = English()
for paragraph in text.split('\n\n'): for paragraph in text.split('\n\n'):

View File

@ -6,7 +6,7 @@ from __future__ import print_function
import os import os
from os import path from os import path
import shutil import shutil
import codecs import io
import random import random
import plac import plac
@ -177,7 +177,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
nlp = Language(data_dir=model_dir) nlp = Language(data_dir=model_dir)
gold_tuples = read_json_file(dev_loc) gold_tuples = read_json_file(dev_loc)
scorer = Scorer() scorer = Scorer()
out_file = codecs.open(out_loc, 'w', 'utf8') out_file = io.open(out_loc, 'w', 'utf8')
for raw_text, sents in gold_tuples: for raw_text, sents in gold_tuples:
sents = _merge_sents(sents) sents = _merge_sents(sents)
for annot_tuples, brackets in sents: for annot_tuples, brackets in sents:

View File

@ -27,7 +27,7 @@ import json
from os import path from os import path
import os import os
import re import re
import codecs import io
from collections import defaultdict from collections import defaultdict
from spacy.munge import read_ptb from spacy.munge import read_ptb
@ -122,7 +122,7 @@ def read_file(*pieces):
if not path.exists(loc): if not path.exists(loc):
return None return None
else: else:
return codecs.open(loc, 'r', 'utf8').read().strip() return io.open(loc, 'r', encoding='utf8').read().strip()
def get_file_names(section_dir, subsection): def get_file_names(section_dir, subsection):

View File

@ -1,5 +1,7 @@
import numpy import numpy
import codecs import io
import json
import ujson
import random import random
import re import re
import os import os

View File

@ -1,5 +1,9 @@
<<<<<<< HEAD
from __future__ import unicode_literals from __future__ import unicode_literals
import codecs import codecs
=======
import io
>>>>>>> 8caedba42a5255b9996533a732e17eee3f20a2dd
from libc.string cimport memcpy from libc.string cimport memcpy
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
@ -129,6 +133,7 @@ cdef class StringStore:
def dump(self, loc): def dump(self, loc):
cdef Utf8Str* string cdef Utf8Str* string
<<<<<<< HEAD
cdef unicode py_string cdef unicode py_string
cdef int i cdef int i
with codecs.open(loc, 'w', 'utf8') as file_: with codecs.open(loc, 'w', 'utf8') as file_:
@ -138,9 +143,18 @@ cdef class StringStore:
file_.write(py_string) file_.write(py_string)
if (i+1) != self.size: if (i+1) != self.size:
file_.write(SEPARATOR) file_.write(SEPARATOR)
=======
cdef bytes py_string
for i in range(self.size):
string = &self.strings[i]
py_string = string.chars[:string.length]
strings.append(py_string.decode('utf8'))
with io.open(loc, 'w', encoding='utf8') as file_:
file_.write(SEPARATOR.join(strings))
>>>>>>> 8caedba42a5255b9996533a732e17eee3f20a2dd
def load(self, loc): def load(self, loc):
with codecs.open(loc, 'r', 'utf8') as file_: with io.open(loc, 'r', encoding='utf8') as file_:
strings = file_.read().split(SEPARATOR) strings = file_.read().split(SEPARATOR)
if strings == ['']: if strings == ['']:
return None return None

View File

@ -1,5 +1,5 @@
from os import path from os import path
import codecs import io
import json import json
import re import re
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
@ -8,7 +8,7 @@ DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
def utf8open(loc, mode='r'): def utf8open(loc, mode='r'):
return codecs.open(loc, mode, 'utf8') return io.open(loc, mode, encoding='utf8')
def read_lang_data(data_dir): def read_lang_data(data_dir):

View File

@ -7,7 +7,7 @@ from libc.stdint cimport uint64_t
import bz2 import bz2
from os import path from os import path
import codecs import io
import math import math
import json import json

View File

@ -1,13 +1,17 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from os import path from os import path
import codecs import io
import pytest import pytest
@pytest.fixture @pytest.fixture
def sun_text(): def sun_text():
<<<<<<< HEAD:tests/parser/test_parse_navigate.py
with codecs.open(path.join(path.dirname(__file__), '..', 'sun.txt'), 'r', 'utf8') as file_: with codecs.open(path.join(path.dirname(__file__), '..', 'sun.txt'), 'r', 'utf8') as file_:
=======
with io.open(path.join(path.dirname(__file__), 'sun.txt'), 'r', encoding='utf8') as file_:
>>>>>>> 8caedba42a5255b9996533a732e17eee3f20a2dd:tests/test_parse_navigate.py
text = file_.read() text = file_.read()
return text return text