mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
Merge branch 'master' of https://github.com/explosion/spaCy
This commit is contained in:
commit
b797dca977
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -97,3 +97,6 @@ Desktop.ini
|
||||||
|
|
||||||
# Other
|
# Other
|
||||||
*.tgz
|
*.tgz
|
||||||
|
|
||||||
|
# Pycharm project files
|
||||||
|
*.idea
|
||||||
|
|
|
@ -29,7 +29,7 @@ def main(vectors_loc, lang=None):
|
||||||
nr_row, nr_dim = header.split()
|
nr_row, nr_dim = header.split()
|
||||||
nlp.vocab.reset_vectors(width=int(nr_dim))
|
nlp.vocab.reset_vectors(width=int(nr_dim))
|
||||||
for line in file_:
|
for line in file_:
|
||||||
line = line.decode('utf8')
|
line = line.rstrip().decode('utf8')
|
||||||
pieces = line.rsplit(' ', nr_dim)
|
pieces = line.rsplit(' ', nr_dim)
|
||||||
word = pieces[0]
|
word = pieces[0]
|
||||||
vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
|
vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
|
||||||
|
|
|
@ -459,6 +459,8 @@ _exc = {
|
||||||
"disorganised": "disorganized",
|
"disorganised": "disorganized",
|
||||||
"distil": "distill",
|
"distil": "distill",
|
||||||
"distils": "distills",
|
"distils": "distills",
|
||||||
|
"doin": "doing",
|
||||||
|
"doin'": "doing",
|
||||||
"dramatisation": "dramatization",
|
"dramatisation": "dramatization",
|
||||||
"dramatisations": "dramatizations",
|
"dramatisations": "dramatizations",
|
||||||
"dramatise": "dramatize",
|
"dramatise": "dramatize",
|
||||||
|
@ -687,6 +689,8 @@ _exc = {
|
||||||
"globalises": "globalizes",
|
"globalises": "globalizes",
|
||||||
"globalising": "globalizing",
|
"globalising": "globalizing",
|
||||||
"glueing ": "gluing ",
|
"glueing ": "gluing ",
|
||||||
|
"goin": "going",
|
||||||
|
"goin'":"going",
|
||||||
"goitre": "goiter",
|
"goitre": "goiter",
|
||||||
"goitres": "goiters",
|
"goitres": "goiters",
|
||||||
"gonorrhoea": "gonorrhea",
|
"gonorrhoea": "gonorrhea",
|
||||||
|
@ -733,6 +737,8 @@ _exc = {
|
||||||
"harmonised": "harmonized",
|
"harmonised": "harmonized",
|
||||||
"harmonises": "harmonizes",
|
"harmonises": "harmonizes",
|
||||||
"harmonising": "harmonizing",
|
"harmonising": "harmonizing",
|
||||||
|
"havin": "having",
|
||||||
|
"havin'": "having",
|
||||||
"homoeopath": "homeopath",
|
"homoeopath": "homeopath",
|
||||||
"homoeopathic": "homeopathic",
|
"homoeopathic": "homeopathic",
|
||||||
"homoeopaths": "homeopaths",
|
"homoeopaths": "homeopaths",
|
||||||
|
@ -924,6 +930,8 @@ _exc = {
|
||||||
"localised": "localized",
|
"localised": "localized",
|
||||||
"localises": "localizes",
|
"localises": "localizes",
|
||||||
"localising": "localizing",
|
"localising": "localizing",
|
||||||
|
"lovin": "loving",
|
||||||
|
"lovin'": "loving",
|
||||||
"louvre": "louver",
|
"louvre": "louver",
|
||||||
"louvred": "louvered",
|
"louvred": "louvered",
|
||||||
"louvres": "louvers ",
|
"louvres": "louvers ",
|
||||||
|
|
|
@ -387,6 +387,21 @@ for exc_data in [
|
||||||
{ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
|
{ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
|
||||||
{ORTH: "lovin'", LEMMA: "love", NORM: "loving"},
|
{ORTH: "lovin'", LEMMA: "love", NORM: "loving"},
|
||||||
{ORTH: "Lovin'", LEMMA: "love", NORM: "loving"},
|
{ORTH: "Lovin'", LEMMA: "love", NORM: "loving"},
|
||||||
|
{ORTH: "lovin", LEMMA: "love", NORM: "loving"},
|
||||||
|
{ORTH: "Lovin", LEMMA: "love", NORM: "loving"},
|
||||||
|
{ORTH: "havin'", LEMMA: "have", NORM: "having"},
|
||||||
|
{ORTH: "Havin'", LEMMA: "have", NORM: "having"},
|
||||||
|
{ORTH: "havin", LEMMA: "have", NORM: "having"},
|
||||||
|
{ORTH: "Havin", LEMMA: "have", NORM: "having"},
|
||||||
|
{ORTH: "doin'", LEMMA: "do", NORM: "doing"},
|
||||||
|
{ORTH: "Doin'", LEMMA: "do", NORM: "doing"},
|
||||||
|
{ORTH: "doin", LEMMA: "do", NORM: "doing"},
|
||||||
|
{ORTH: "Doin", LEMMA: "do", NORM: "doing"},
|
||||||
|
{ORTH: "goin'", LEMMA: "go", NORM: "going"},
|
||||||
|
{ORTH: "Goin'", LEMMA: "go", NORM: "going"},
|
||||||
|
{ORTH: "goin", LEMMA: "go", NORM: "going"},
|
||||||
|
{ORTH: "Goin", LEMMA: "go", NORM: "going"},
|
||||||
|
|
||||||
|
|
||||||
{ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
|
{ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
|
||||||
{ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
|
{ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
|
||||||
|
|
|
@ -21,8 +21,25 @@ class JapaneseTokenizer(object):
|
||||||
words = [x.surface for x in self.tokenizer.tokenize(text)]
|
words = [x.surface for x in self.tokenizer.tokenize(text)]
|
||||||
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||||
|
|
||||||
|
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
|
||||||
|
# allow serialization (see #1557)
|
||||||
|
def to_bytes(self, **exclude):
|
||||||
|
return b''
|
||||||
|
|
||||||
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
|
return self
|
||||||
|
|
||||||
|
def to_disk(self, path, **exclude):
|
||||||
|
return None
|
||||||
|
|
||||||
|
def from_disk(self, path, **exclude):
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
class JapaneseDefaults(Language.Defaults):
|
class JapaneseDefaults(Language.Defaults):
|
||||||
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
|
lex_attr_getters[LANG] = lambda text: 'ja'
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create_tokenizer(cls, nlp=None):
|
def create_tokenizer(cls, nlp=None):
|
||||||
return JapaneseTokenizer(cls, nlp)
|
return JapaneseTokenizer(cls, nlp)
|
||||||
|
|
|
@ -257,7 +257,7 @@ p
|
||||||
+row
|
+row
|
||||||
+cell #[code dev_data]
|
+cell #[code dev_data]
|
||||||
+cell positional
|
+cell positional
|
||||||
+cell Location of JSON-formatted dev data (optional).
|
+cell Location of JSON-formatted development data for evaluation.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --n-iter], #[code -n]
|
+cell #[code --n-iter], #[code -n]
|
||||||
|
|
|
@ -562,7 +562,7 @@ p
|
||||||
+cell #[code orth_]
|
+cell #[code orth_]
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell
|
+cell
|
||||||
| Verbatim text content (identical to #[code Span.text]). Existst
|
| Verbatim text content (identical to #[code Span.text]). Exists
|
||||||
| mostly for consistency with the other attributes.
|
| mostly for consistency with the other attributes.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
|
|
Loading…
Reference in New Issue
Block a user