Matthew Honnibal 2017-11-15 13:11:43 +01:00
commit b797dca977
7 changed files with 46 additions and 3 deletions

.gitignore
View File

@@ -97,3 +97,6 @@ Desktop.ini
 
 # Other
 *.tgz
+
+# Pycharm project files
+*.idea

View File

@@ -29,7 +29,7 @@ def main(vectors_loc, lang=None):
         nr_row, nr_dim = header.split()
         nlp.vocab.reset_vectors(width=int(nr_dim))
         for line in file_:
-            line = line.decode('utf8')
+            line = line.rstrip().decode('utf8')
             pieces = line.rsplit(' ', int(nr_dim))
             word = pieces[0]
             vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
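Why the added rstrip() matters: fastText .vec files end each vector line with a trailing space before the newline, so without stripping, rsplit() misaligns the fields. A minimal sketch with a hypothetical line (the header fields arrive as strings, hence the int()):

import numpy

raw = b'apple 0.1 0.2 0.3 \n'   # hypothetical fastText-style vector line
nr_dim = '3'                    # parsed from the header, still a string

bad = raw.decode('utf8').rsplit(' ', int(nr_dim))
# ['apple 0.1', '0.2', '0.3', '\n'] -- feeding '\n' to float() raises ValueError

pieces = raw.rstrip().decode('utf8').rsplit(' ', int(nr_dim))
word = pieces[0]                                     # 'apple'
vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
assert word == 'apple' and vector.shape == (3,)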

View File

@@ -459,6 +459,8 @@ _exc = {
     "disorganised": "disorganized",
     "distil": "distill",
     "distils": "distills",
+    "doin": "doing",
+    "doin'": "doing",
     "dramatisation": "dramatization",
     "dramatisations": "dramatizations",
     "dramatise": "dramatize",
@@ -687,6 +689,8 @@ _exc = {
     "globalises": "globalizes",
     "globalising": "globalizing",
     "glueing ": "gluing ",
+    "goin": "going",
+    "goin'": "going",
     "goitre": "goiter",
     "goitres": "goiters",
     "gonorrhoea": "gonorrhea",
@@ -733,6 +737,8 @@ _exc = {
     "harmonised": "harmonized",
     "harmonises": "harmonizes",
     "harmonising": "harmonizing",
+    "havin": "having",
+    "havin'": "having",
     "homoeopath": "homeopath",
     "homoeopathic": "homeopathic",
     "homoeopaths": "homeopaths",
@@ -924,6 +930,8 @@ _exc = {
     "localised": "localized",
     "localises": "localizes",
     "localising": "localizing",
+    "lovin": "loving",
+    "lovin'": "loving",
     "louvre": "louver",
     "louvred": "louvered",
     "louvres": "louvers ",

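The pairs added above ("doin", "goin", "havin", "lovin", each with and without the apostrophe) feed the NORM attribute, so the informal spellings normalize to the full form. A quick check, assuming a spaCy build that includes these entries:

import spacy

nlp = spacy.blank('en')
doc = nlp(u"What you doin out there?")
print([(t.text, t.norm_) for t in doc])
# ('doin', 'doing') is expected among the pairs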
View File

@@ -387,6 +387,21 @@ for exc_data in [
     {ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
     {ORTH: "lovin'", LEMMA: "love", NORM: "loving"},
     {ORTH: "Lovin'", LEMMA: "love", NORM: "loving"},
+    {ORTH: "lovin", LEMMA: "love", NORM: "loving"},
+    {ORTH: "Lovin", LEMMA: "love", NORM: "loving"},
+    {ORTH: "havin'", LEMMA: "have", NORM: "having"},
+    {ORTH: "Havin'", LEMMA: "have", NORM: "having"},
+    {ORTH: "havin", LEMMA: "have", NORM: "having"},
+    {ORTH: "Havin", LEMMA: "have", NORM: "having"},
+    {ORTH: "doin'", LEMMA: "do", NORM: "doing"},
+    {ORTH: "Doin'", LEMMA: "do", NORM: "doing"},
+    {ORTH: "doin", LEMMA: "do", NORM: "doing"},
+    {ORTH: "Doin", LEMMA: "do", NORM: "doing"},
+    {ORTH: "goin'", LEMMA: "go", NORM: "going"},
+    {ORTH: "Goin'", LEMMA: "go", NORM: "going"},
+    {ORTH: "goin", LEMMA: "go", NORM: "going"},
+    {ORTH: "Goin", LEMMA: "go", NORM: "going"},
     {ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
     {ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
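Each dict above registers a tokenizer special case, so the contraction is kept as a single token with the given lemma and norm instead of being split at the apostrophe. The same mechanism is exposed through the public API; a sketch using a made-up token:

import spacy
from spacy.symbols import ORTH, LEMMA, NORM

nlp = spacy.blank('en')
nlp.tokenizer.add_special_case(
    u"runnin'", [{ORTH: u"runnin'", LEMMA: u"run", NORM: u"running"}])
doc = nlp(u"He was runnin' home")
assert [t.text for t in doc] == [u'He', u'was', u"runnin'", u'home']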

View File

@@ -21,8 +21,25 @@ class JapaneseTokenizer(object):
         words = [x.surface for x in self.tokenizer.tokenize(text)]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))
 
+    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
+    # allow serialization (see #1557)
+    def to_bytes(self, **exclude):
+        return b''
+
+    def from_bytes(self, bytes_data, **exclude):
+        return self
+
+    def to_disk(self, path, **exclude):
+        return None
+
+    def from_disk(self, path, **exclude):
+        return self
+
 
 class JapaneseDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'ja'
+
     @classmethod
     def create_tokenizer(cls, nlp=None):
         return JapaneseTokenizer(cls, nlp)
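The no-op stubs are needed because Language.to_bytes() and Language.to_disk() call the corresponding method on every component, including the tokenizer, so a custom tokenizer without them breaks serialization (#1557). A hypothetical custom tokenizer following the same pattern:

from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    """Illustrative only: splits on spaces, but serializes cleanly."""
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        return Doc(self.vocab, words=words, spaces=[False] * len(words))

    # no-op serialization hooks, mirroring the JapaneseTokenizer fix
    def to_bytes(self, **exclude):
        return b''

    def from_bytes(self, bytes_data, **exclude):
        return self

    def to_disk(self, path, **exclude):
        return None

    def from_disk(self, path, **exclude):
        return self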

View File

@@ -257,7 +257,7 @@ p
     +row
         +cell #[code dev_data]
         +cell positional
-        +cell Location of JSON-formatted dev data (optional).
+        +cell Location of JSON-formatted development data for evaluation.
 
     +row
         +cell #[code --n-iter], #[code -n]
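For orientation, the development data is passed positionally after the training data; a hypothetical invocation with placeholder paths:

python -m spacy train en /output train.json dev.json --n-iter 10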

View File

@@ -562,7 +562,7 @@ p
         +cell #[code orth_]
         +cell unicode
         +cell
-            | Verbatim text content (identical to #[code Span.text]). Existst
+            | Verbatim text content (identical to #[code Span.text]). Exists
             | mostly for consistency with the other attributes.
 
     +row
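A quick illustration of the documented equivalence (a sketch; any text works):

import spacy

nlp = spacy.blank('en')
doc = nlp(u'New York is busy')
span = doc[0:2]
assert span.orth_ == span.text == u'New York'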