From 916de3c2155fdbddfeeb410b7a51e013c2e464be Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 24 Sep 2015 19:24:23 +1000 Subject: [PATCH 1/3] * Write updated load-new-word-vectors documentation --- .../load-new-word-vectors/index.jade | 72 ++++++++++++------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/website/src/jade/tutorials/load-new-word-vectors/index.jade b/website/src/jade/tutorials/load-new-word-vectors/index.jade index 856193041..3a592634f 100644 --- a/website/src/jade/tutorials/load-new-word-vectors/index.jade +++ b/website/src/jade/tutorials/load-new-word-vectors/index.jade @@ -3,12 +3,23 @@ include ../header.jade +WritePost(Meta) - p By default spaCy loads a #[code data/vocab/vec.bin] file, where the #[em data] directory is within the #[code spacy.en] module directory. + p By default spaCy loads a #[code data/vocab/vec.bin] file, where the #[em data] directory is within the #[code spacy.en] module directory. This file can be replaced, to customize the word vectors that spaCy loads. You can also replace the word vectors at run-time. - p You can customize the word vectors loaded by spaCy in three different ways. For the first two, you'll need to convert your vectors into spaCy's binary file format. The binary format is used because it's smaller and loads faster. - p You can either place the binary file in the location spaCy expects - + h4 Replacing vec.bin + + p The function #[code spacy.vocab.write_binary_vectors] creates a word vectors file in spaCy's binary data format. It expects a #[code bz2] file in the following format: + + pre + code + word_key1 0.92 0.45 -0.9 0.0 + word_key2 0.3 0.1 0.6 0.3 + ... + + p That is, each line is a single entry. Each entry consists of a key string, followed by a sequence of floats. Each entry should have the same number of floats. 
+ + p The following example script will replace the #[code vec.bin] file with vectors read from a #[code bz2] archive: + pre code.language-python | from spacy.vocab import write_binary_vectors @@ -23,29 +34,38 @@ include ../header.jade | if __name__ == '__main__': | plac.call(main) - - - ol - li Replace the vec.bin, so your vectors will be loaded by default. The function #[code spacy.vocab.write_binary_vectors] is provided to convert files to spaCy's binary format. The advantage of the binary format is that it's smaller and loads faster. - - li Load vectors at run-time - - -Create the vec.bin file from a bz2 file using spacy.vocab.write_binary_vectors -Either replace spaCy's vec.bin file, or call nlp.vocab.load_rep_vectors at run-time, with the path to the binary file. -The above is a bit inconvenient at first, but the binary file format is much smaller and faster to load, and the vectors files are fairly big. Note that GloVe distributes in gzip format, not bzip. - -Out of interest: are you using the GloVe vectors, or something you trained on your own data? If your own data, did you use Gensim? I'd like to make this much easier, so I'd appreciate suggestions for what work-flow you'd like to see. - Load new vectors at run-time, optionally converting them + h4 Replace the vectors at run-time, from an archive + + p Since v0.93, instances of #[code Vocab] allow new vectors to be loaded from #[code bz2] archive files. 
This allows vectors to be loaded as follows: + pre code.language-python - | import spacy.vocab - - | def set_spacy_vectors(nlp, binary_loc, bz2_loc=None): - | if bz2_loc is not None: - | spacy.vocab.write_binary_vectors(bz2_loc, binary_loc) - | write_binary_vectors(bz2_input_loc, binary_loc) - | - | nlp.vocab.load_rep_vectors(binary_loc) + | >>> from spacy.en import English + | >>> nlp = English() + | >>> n_dimensions = nlp.vocab.load_vectors('glove.840B.300d.txt.bz2') + | >>> n_dimensions + | 300 + h4 Replace vectors at run-time, per word + + p Since v0.93, you can assign to the #[code .vector] attribute of #[code Lexeme] instances. Tokens of that lexical type will then inherit the updated vector. For instance: + + pre + code.language-python + | >>> from spacy.en import English + | >>> nlp = English() + | >>> apples, oranges = nlp(u'apples oranges') + | + | >>> apples_lexeme = nlp.vocab[u'apples'] + | >>> type(apples), type(apples_lexeme) + | (<type 'spacy.tokens.token.Token'>, <type 'spacy.lexeme.Lexeme'>) + | >>> sum(apples.vector) + | 0.56299778164247982 + | >>> apples_lexeme.vector *= 2 + | >>> sum(apples.vector) + | 1.1259955632849596 + + p All tokens which have the #[code orth] attribute #[em apples] will inherit the updated vector. + + p Note that the updated vectors won't persist after exit, unless you persist them yourself, and then replace the #[code vec.bin] file as described above. From cecb04cfba2c596859ab91800a9f61e427d302e7 Mon Sep 17 00:00:00 2001 From: Adam Mathias Bittlingmayer Date: Thu, 24 Sep 2015 03:23:48 -0700 Subject: [PATCH 2/3] Create README.md --- website/README.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 website/README.md diff --git a/website/README.md b/website/README.md new file mode 100644 index 000000000..efc3cd96c --- /dev/null +++ b/website/README.md @@ -0,0 +1,30 @@ +Source for spacy.io +============================== + +This directory contains the source for the official spaCy website at http://spacy.io/. 
+ +Fixes, updates and suggestions are welcome. + + +Releases +-------- +Changes made to this directory go live on spacy.io. + + +The Stack +-------- +The site is built with the [Jade](http://jade-lang.com/) template language. + +See [the Makefile](Makefile) for more details. + + +Developing +-------- +To make and test changes: +``` + npm install jade --global + cd website + make + python -m SimpleHTTPServer 8000 +``` +Then visit [localhost:8000/src/...](http://localhost:8000/src/) From c579b6b96cfefccf47511ca0a718b1ea8c89a301 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 24 Sep 2015 22:38:41 +1000 Subject: [PATCH 3/3] * Update English morphs.json --- lang_data/en/morphs.json | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/lang_data/en/morphs.json b/lang_data/en/morphs.json index 69e7d98a4..669dc2884 100644 --- a/lang_data/en/morphs.json +++ b/lang_data/en/morphs.json @@ -1,14 +1,14 @@ { "PRP": { - "I": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Nom"}, - "me": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc"}, - "you": {"L": "-PRON-", "PronType": "Prs", "Person": "Two", "Case": "Nom,Acc"}, + "I": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Nom"}, + "me": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc"}, + "you": {"L": "-PRON-", "PronType": "Prs", "Person": "Two"}, "he": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Nom"}, "him": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Acc"}, - "she": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Nom"}, + "she": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Nom"}, "her": {"L": "-PRON-", "PronType": "Prs", "Person": 
"Three", "Number": "Sing", "Gender": "Fem", "Case": "Acc"}, - "it": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Case": "Nom,Acc"}, - "we": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Nom"}, + "it": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"}, + "we": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Nom"}, "us": {"L": "-PRON-", "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc"}, "they": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Nom"}, "them": {"L": "-PRON-", "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"}, @@ -35,25 +35,12 @@ }, "PRP$": { - "my": {"L": "-PRON-", "Person": "One", "Number": "Sing", "PronType": "Prs", "Poss": "Yes"}, - "your": {"L": "-PRON-", "Person": "Two", "Number": "Sing,Plur", "PronType": "Prs", "Poss": "Yes"}, - "his": {"L": "-PRON-", "Person": "Three", "Number": "Sing", "Gender": "Masc", "PronType": "Prs", "Poss": "Yes"}, - "her": {"L": "-PRON-", "Person": "Three", "Number": "Sing", "Gender": "Fem", "PronType": "Prs", "Poss": "Yes"}, - "its": {"L": "-PRON-", "Person": "Three", "Number": "Sing", "Gender": "Neut", "PronType": "Prs", "Poss": "Yes"}, - "our": {"L": "-PRON-", "Person": "One", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"}, - "their": {"L": "-PRON-", "Person": "Three", "Number": "Plur", "Gender": "Masc,Fem,Neut", "PronType": "Prs", "Poss": "Yes"} - }, - "JJR": { - "better": {"L": "good", "misc": 1} - }, - "JJS": { - "best": {"L": "good", "misc": 2} - }, - - "RBR": { - "better": {"L": "good", "misc": 1} - }, - "RBS": { - "best": {"L": "good", "misc": 2} + "my": {"L": "-PRON-", "Person": "One", "Number": "Sing", "PronType": "Prs", "Poss": "Yes"}, + "your": {"L": "-PRON-", "Person": "Two", "PronType": "Prs", "Poss": "Yes"}, + "his": {"L": "-PRON-", "Person": "Three", "Number": "Sing", "Gender": 
"Masc", "PronType": "Prs", "Poss": "Yes"}, + "her": {"L": "-PRON-", "Person": "Three", "Number": "Sing", "Gender": "Fem", "PronType": "Prs", "Poss": "Yes"}, + "its": {"L": "-PRON-", "Person": "Three", "Number": "Sing", "Gender": "Neut", "PronType": "Prs", "Poss": "Yes"}, + "our": {"L": "-PRON-", "Person": "One", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"}, + "their": {"L": "-PRON-", "Person": "Three", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"} } }