Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-10-30 19:44:58 +01:00
commit 4e3006cec7
15 changed files with 392 additions and 69 deletions

View File

@ -0,0 +1,101 @@
{"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
{"orth": ".", "id": 1, "lower": ".", "norm": ".", "shape": ".", "prefix": ".", "suffix": ".", "length": 1, "cluster": "8", "prob": -3.0678977966308594, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": ",", "id": 2, "lower": ",", "norm": ",", "shape": ",", "prefix": ",", "suffix": ",", "length": 1, "cluster": "4", "prob": -3.4549596309661865, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "the", "id": 3, "lower": "the", "norm": "the", "shape": "xxx", "prefix": "t", "suffix": "the", "length": 3, "cluster": "11", "prob": -3.528766632080078, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "I", "id": 4, "lower": "i", "norm": "I", "shape": "X", "prefix": "I", "suffix": "I", "length": 1, "cluster": "346", "prob": -3.791565179824829, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": true, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "to", "id": 5, "lower": "to", "norm": "to", "shape": "xx", "prefix": "t", "suffix": "to", "length": 2, "cluster": "12", "prob": -3.8560216426849365, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "a", "id": 6, "lower": "a", "norm": "a", "shape": "x", "prefix": "a", "suffix": "a", "length": 1, "cluster": "19", "prob": -3.92978835105896, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "and", "id": 7, "lower": "and", "norm": "and", "shape": "xxx", "prefix": "a", "suffix": "and", "length": 3, "cluster": "20", "prob": -4.113108158111572, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "of", "id": 8, "lower": "of", "norm": "of", "shape": "xx", "prefix": "o", "suffix": "of", "length": 2, "cluster": "28", "prob": -4.27587366104126, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "you", "id": 9, "lower": "you", "norm": "you", "shape": "xxx", "prefix": "y", "suffix": "you", "length": 3, "cluster": "602", "prob": -4.373791217803955, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "it", "id": 10, "lower": "it", "norm": "it", "shape": "xx", "prefix": "i", "suffix": "it", "length": 2, "cluster": "474", "prob": -4.388050079345703, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "is", "id": 11, "lower": "is", "norm": "is", "shape": "xx", "prefix": "i", "suffix": "is", "length": 2, "cluster": "762", "prob": -4.457748889923096, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "that", "id": 12, "lower": "that", "norm": "that", "shape": "xxxx", "prefix": "t", "suffix": "hat", "length": 4, "cluster": "84", "prob": -4.464504718780518, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "\n\n", "id": 0, "lower": "\n\n", "norm": "\n\n", "shape": "\n\n", "prefix": "\n", "suffix": "\n\n", "length": 2, "cluster": "0", "prob": -4.606560707092285, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "in", "id": 13, "lower": "in", "norm": "in", "shape": "xx", "prefix": "i", "suffix": "in", "length": 2, "cluster": "60", "prob": -4.619071960449219, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "'s", "id": 14, "lower": "'s", "norm": "'s", "shape": "'x", "prefix": "'", "suffix": "'s", "length": 2, "cluster": "52", "prob": -4.830559253692627, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "n't", "id": 15, "lower": "n't", "norm": "n't", "shape": "x'x", "prefix": "n", "suffix": "n't", "length": 3, "cluster": "74", "prob": -4.859938621520996, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "for", "id": 16, "lower": "for", "norm": "for", "shape": "xxx", "prefix": "f", "suffix": "for", "length": 3, "cluster": "508", "prob": -4.8801093101501465, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "\"", "id": 17, "lower": "\"", "norm": "\"", "shape": "\"", "prefix": "\"", "suffix": "\"", "length": 1, "cluster": "0", "prob": -5.02677583694458, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": true, "is_left_punct": true, "is_right_punct": true}
{"orth": "?", "id": 18, "lower": "?", "norm": "?", "shape": "?", "prefix": "?", "suffix": "?", "length": 1, "cluster": "0", "prob": -5.05924654006958, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": " ", "id": 0, "lower": " ", "norm": " ", "shape": " ", "prefix": " ", "suffix": " ", "length": 1, "cluster": "0", "prob": -5.129165172576904, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "have", "id": 19, "lower": "have", "norm": "have", "shape": "xxxx", "prefix": "h", "suffix": "ave", "length": 4, "cluster": "378", "prob": -5.156484603881836, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "on", "id": 20, "lower": "on", "norm": "on", "shape": "xx", "prefix": "o", "suffix": "on", "length": 2, "cluster": "2044", "prob": -5.172736167907715, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "*", "id": 21, "lower": "*", "norm": "*", "shape": "*", "prefix": "*", "suffix": "*", "length": 1, "cluster": "5098", "prob": -5.1977410316467285, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": ")", "id": 22, "lower": ")", "norm": ")", "shape": ")", "prefix": ")", "suffix": ")", "length": 1, "cluster": "0", "prob": -5.197994232177734, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": true}
{"orth": "be", "id": 23, "lower": "be", "norm": "be", "shape": "xx", "prefix": "b", "suffix": "be", "length": 2, "cluster": "458", "prob": -5.225094318389893, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "with", "id": 24, "lower": "with", "norm": "with", "shape": "xxxx", "prefix": "w", "suffix": "ith", "length": 4, "cluster": "1020", "prob": -5.243249893188477, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "do", "id": 25, "lower": "do", "norm": "do", "shape": "xx", "prefix": "d", "suffix": "do", "length": 2, "cluster": "2042", "prob": -5.246996879577637, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "was", "id": 26, "lower": "was", "norm": "was", "shape": "xxx", "prefix": "w", "suffix": "was", "length": 3, "cluster": "250", "prob": -5.252320289611816, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "are", "id": 27, "lower": "are", "norm": "are", "shape": "xxx", "prefix": "a", "suffix": "are", "length": 3, "cluster": "1530", "prob": -5.271068096160889, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "not", "id": 28, "lower": "not", "norm": "not", "shape": "xxx", "prefix": "n", "suffix": "not", "length": 3, "cluster": "1258", "prob": -5.332601070404053, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "but", "id": 29, "lower": "but", "norm": "but", "shape": "xxx", "prefix": "b", "suffix": "but", "length": 3, "cluster": "148", "prob": -5.3419694900512695, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "!", "id": 30, "lower": "!", "norm": "!", "shape": "!", "prefix": "!", "suffix": "!", "length": 1, "cluster": "0", "prob": -5.359641075134277, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "this", "id": 31, "lower": "this", "norm": "this", "shape": "xxxx", "prefix": "t", "suffix": "his", "length": 4, "cluster": "63", "prob": -5.36181640625, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "[", "id": 32, "lower": "[", "norm": "[", "shape": "[", "prefix": "[", "suffix": "[", "length": 1, "cluster": "0", "prob": -5.438112258911133, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": true, "is_right_punct": false}
{"orth": "-", "id": 33, "lower": "-", "norm": "-", "shape": "-", "prefix": "-", "suffix": "-", "length": 1, "cluster": "36", "prob": -5.468655109405518, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "my", "id": 34, "lower": "my", "norm": "my", "shape": "xx", "prefix": "m", "suffix": "my", "length": 2, "cluster": "251", "prob": -5.491642951965332, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "they", "id": 35, "lower": "they", "norm": "they", "shape": "xxxx", "prefix": "t", "suffix": "hey", "length": 4, "cluster": "90", "prob": -5.5243682861328125, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "as", "id": 36, "lower": "as", "norm": "as", "shape": "xx", "prefix": "a", "suffix": "as", "length": 2, "cluster": "212", "prob": -5.53448486328125, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "like", "id": 37, "lower": "like", "norm": "like", "shape": "xxxx", "prefix": "l", "suffix": "ike", "length": 4, "cluster": "1684", "prob": -5.610429763793945, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "just", "id": 38, "lower": "just", "norm": "just", "shape": "xxxx", "prefix": "j", "suffix": "ust", "length": 4, "cluster": "31978", "prob": -5.630868434906006, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "your", "id": 39, "lower": "your", "norm": "your", "shape": "xxxx", "prefix": "y", "suffix": "our", "length": 4, "cluster": "251", "prob": -5.650108814239502, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "or", "id": 40, "lower": "or", "norm": "or", "shape": "xx", "prefix": "o", "suffix": "or", "length": 2, "cluster": "404", "prob": -5.654984951019287, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "(", "id": 41, "lower": "(", "norm": "(", "shape": "(", "prefix": "(", "suffix": "(", "length": 1, "cluster": "0", "prob": -5.75598669052124, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": true, "is_right_punct": false}
{"orth": "at", "id": 42, "lower": "at", "norm": "at", "shape": "xx", "prefix": "a", "suffix": "at", "length": 2, "cluster": "124", "prob": -5.763442516326904, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "if", "id": 43, "lower": "if", "norm": "if", "shape": "xx", "prefix": "i", "suffix": "if", "length": 2, "cluster": "4052", "prob": -5.763589859008789, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "would", "id": 44, "lower": "would", "norm": "would", "shape": "xxxx", "prefix": "w", "suffix": "uld", "length": 5, "cluster": "1978", "prob": -5.772674560546875, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "so", "id": 45, "lower": "so", "norm": "so", "shape": "xx", "prefix": "s", "suffix": "so", "length": 2, "cluster": "2282", "prob": -5.823773384094238, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "can", "id": 46, "lower": "can", "norm": "can", "shape": "xxx", "prefix": "c", "suffix": "can", "length": 3, "cluster": "58", "prob": -5.827763080596924, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "me", "id": 47, "lower": "me", "norm": "me", "shape": "xx", "prefix": "m", "suffix": "me", "length": 2, "cluster": "1898", "prob": -5.846089839935303, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "about", "id": 48, "lower": "about", "norm": "about", "shape": "xxxx", "prefix": "a", "suffix": "out", "length": 5, "cluster": "618", "prob": -5.906808853149414, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "he", "id": 49, "lower": "he", "norm": "he", "shape": "xx", "prefix": "h", "suffix": "he", "length": 2, "cluster": "218", "prob": -5.9319047927856445, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "It", "id": 50, "lower": "it", "norm": "It", "shape": "Xx", "prefix": "I", "suffix": "It", "length": 2, "cluster": "894", "prob": -5.93662691116333, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "all", "id": 51, "lower": "all", "norm": "all", "shape": "xxx", "prefix": "a", "suffix": "all", "length": 3, "cluster": "6122", "prob": -5.936640739440918, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "The", "id": 52, "lower": "the", "norm": "The", "shape": "Xxx", "prefix": "T", "suffix": "The", "length": 3, "cluster": "30", "prob": -5.958707332611084, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "get", "id": 53, "lower": "get", "norm": "get", "shape": "xxx", "prefix": "g", "suffix": "get", "length": 3, "cluster": "2570", "prob": -5.992605686187744, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "one", "id": 54, "lower": "one", "norm": "one", "shape": "xxx", "prefix": "o", "suffix": "one", "length": 3, "cluster": "8170", "prob": -5.996385097503662, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": true, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "'m", "id": 55, "lower": "'m", "norm": "'m", "shape": "'x", "prefix": "'", "suffix": "'m", "length": 2, "cluster": "3066", "prob": -5.9999823570251465, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "out", "id": 56, "lower": "out", "norm": "out", "shape": "xxx", "prefix": "o", "suffix": "out", "length": 3, "cluster": "1386", "prob": -6.0027008056640625, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "from", "id": 57, "lower": "from", "norm": "from", "shape": "xxxx", "prefix": "f", "suffix": "rom", "length": 4, "cluster": "380", "prob": -6.010132312774658, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "an", "id": 58, "lower": "an", "norm": "an", "shape": "xx", "prefix": "a", "suffix": "an", "length": 2, "cluster": "3", "prob": -6.014852046966553, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "what", "id": 59, "lower": "what", "norm": "what", "shape": "xxxx", "prefix": "w", "suffix": "hat", "length": 4, "cluster": "2026", "prob": -6.023346424102783, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "up", "id": 60, "lower": "up", "norm": "up", "shape": "xx", "prefix": "u", "suffix": "up", "length": 2, "cluster": "362", "prob": -6.028695583343506, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "]", "id": 61, "lower": "]", "norm": "]", "shape": "]", "prefix": "]", "suffix": "]", "length": 1, "cluster": "0", "prob": -6.0386552810668945, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": true}
{"orth": "\n", "id": 0, "lower": "\n", "norm": "\n", "shape": "\n", "prefix": "\n", "suffix": "\n", "length": 1, "cluster": "0", "prob": -6.0506510734558105, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "people", "id": 62, "lower": "people", "norm": "people", "shape": "xxxx", "prefix": "p", "suffix": "ple", "length": 6, "cluster": "365", "prob": -6.0715765953063965, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "more", "id": 63, "lower": "more", "norm": "more", "shape": "xxxx", "prefix": "m", "suffix": "ore", "length": 4, "cluster": "1514", "prob": -6.081598281860352, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": ":", "id": 64, "lower": ":", "norm": ":", "shape": ":", "prefix": ":", "suffix": ":", "length": 1, "cluster": "228", "prob": -6.128875732421875, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "there", "id": 65, "lower": "there", "norm": "there", "shape": "xxxx", "prefix": "t", "suffix": "ere", "length": 5, "cluster": "986", "prob": -6.135282039642334, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "deleted", "id": 66, "lower": "deleted", "norm": "deleted", "shape": "xxxx", "prefix": "d", "suffix": "ted", "length": 7, "cluster": "1706", "prob": -6.1543049812316895, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "think", "id": 67, "lower": "think", "norm": "think", "shape": "xxxx", "prefix": "t", "suffix": "ink", "length": 5, "cluster": "1674", "prob": -6.180924892425537, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "will", "id": 68, "lower": "will", "norm": "will", "shape": "xxxx", "prefix": "w", "suffix": "ill", "length": 4, "cluster": "442", "prob": -6.199834823608398, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "them", "id": 69, "lower": "them", "norm": "them", "shape": "xxxx", "prefix": "t", "suffix": "hem", "length": 4, "cluster": "5994", "prob": -6.2177276611328125, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "we", "id": 70, "lower": "we", "norm": "we", "shape": "xx", "prefix": "w", "suffix": "we", "length": 2, "cluster": "1626", "prob": -6.230024337768555, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "'re", "id": 71, "lower": "'re", "norm": "'re", "shape": "'xx", "prefix": "'", "suffix": "'re", "length": 3, "cluster": "7162", "prob": -6.255462646484375, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "when", "id": 72, "lower": "when", "norm": "when", "shape": "xxxx", "prefix": "w", "suffix": "hen", "length": 4, "cluster": "16340", "prob": -6.2623114585876465, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "You", "id": 73, "lower": "you", "norm": "You", "shape": "Xxx", "prefix": "Y", "suffix": "You", "length": 3, "cluster": "858", "prob": -6.276494026184082, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "...", "id": 74, "lower": "...", "norm": "...", "shape": "...", "prefix": ".", "suffix": "...", "length": 3, "cluster": "966", "prob": -6.278521537780762, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "some", "id": 75, "lower": "some", "norm": "some", "shape": "xxxx", "prefix": "s", "suffix": "ome", "length": 4, "cluster": "239", "prob": -6.318882465362549, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "has", "id": 76, "lower": "has", "norm": "has", "shape": "xxx", "prefix": "h", "suffix": "has", "length": 3, "cluster": "890", "prob": -6.325605392456055, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "because", "id": 77, "lower": "because", "norm": "because", "shape": "xxxx", "prefix": "b", "suffix": "use", "length": 7, "cluster": "980", "prob": -6.349620342254639, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "know", "id": 78, "lower": "know", "norm": "know", "shape": "xxxx", "prefix": "k", "suffix": "now", "length": 4, "cluster": "3722", "prob": -6.368943214416504, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "really", "id": 79, "lower": "really", "norm": "really", "shape": "xxxx", "prefix": "r", "suffix": "lly", "length": 6, "cluster": "7802", "prob": -6.370757102966309, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "by", "id": 80, "lower": "by", "norm": "by", "shape": "xx", "prefix": "b", "suffix": "by", "length": 2, "cluster": "252", "prob": -6.375086784362793, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "time", "id": 81, "lower": "time", "norm": "time", "shape": "xxxx", "prefix": "t", "suffix": "ime", "length": 4, "cluster": "477", "prob": -6.3782219886779785, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "did", "id": 82, "lower": "did", "norm": "did", "shape": "xxx", "prefix": "d", "suffix": "did", "length": 3, "cluster": "8186", "prob": -6.389003753662109, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "no", "id": 83, "lower": "no", "norm": "no", "shape": "xx", "prefix": "n", "suffix": "no", "length": 2, "cluster": "4074", "prob": -6.402691841125488, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "had", "id": 84, "lower": "had", "norm": "had", "shape": "xxx", "prefix": "h", "suffix": "had", "length": 3, "cluster": "1914", "prob": -6.45427131652832, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "their", "id": 85, "lower": "their", "norm": "their", "shape": "xxxx", "prefix": "t", "suffix": "eir", "length": 5, "cluster": "187", "prob": -6.461463928222656, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "If", "id": 86, "lower": "if", "norm": "If", "shape": "Xx", "prefix": "I", "suffix": "If", "length": 2, "cluster": "190", "prob": -6.469156742095947, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "how", "id": 87, "lower": "how", "norm": "how", "shape": "xxx", "prefix": "h", "suffix": "how", "length": 3, "cluster": "10218", "prob": -6.496722221374512, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "does", "id": 88, "lower": "does", "norm": "does", "shape": "xxxx", "prefix": "d", "suffix": "oes", "length": 4, "cluster": "4090", "prob": -6.500738143920898, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "who", "id": 89, "lower": "who", "norm": "who", "shape": "xxx", "prefix": "w", "suffix": "who", "length": 3, "cluster": "410", "prob": -6.504637241363525, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "than", "id": 90, "lower": "than", "norm": "than", "shape": "xxxx", "prefix": "t", "suffix": "han", "length": 4, "cluster": "106", "prob": -6.512253761291504, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "good", "id": 91, "lower": "good", "norm": "good", "shape": "xxxx", "prefix": "g", "suffix": "ood", "length": 4, "cluster": "551", "prob": -6.518923759460449, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "only", "id": 92, "lower": "only", "norm": "only", "shape": "xxxx", "prefix": "o", "suffix": "nly", "length": 4, "cluster": "15594", "prob": -6.535442352294922, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "his", "id": 93, "lower": "his", "norm": "his", "shape": "xxx", "prefix": "h", "suffix": "his", "length": 3, "cluster": "123", "prob": -6.574275016784668, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "much", "id": 94, "lower": "much", "norm": "much", "shape": "xxxx", "prefix": "m", "suffix": "uch", "length": 4, "cluster": "2794", "prob": -6.584301948547363, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": ";", "id": 95, "lower": ";", "norm": ";", "shape": ";", "prefix": ";", "suffix": ";", "length": 1, "cluster": "36", "prob": -6.586422920227051, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "'ve", "id": 96, "lower": "'ve", "norm": "'ve", "shape": "'xx", "prefix": "'", "suffix": "'ve", "length": 3, "cluster": "1018", "prob": -6.593011379241943, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
{"orth": "could", "id": 97, "lower": "could", "norm": "could", "shape": "xxxx", "prefix": "c", "suffix": "uld", "length": 5, "cluster": "954", "prob": -6.595959186553955, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}

View File

@ -19,7 +19,7 @@ if __name__ == '__main__':
'convert': convert, 'convert': convert,
'package': package, 'package': package,
'model': model, 'model': model,
'model': vocab, 'vocab': vocab,
'profile': profile, 'profile': profile,
'validate': validate 'validate': validate
} }

View File

@ -17,14 +17,14 @@ numpy.random.seed(0)
@plac.annotations( @plac.annotations(
model=("Model name or path", "positional", None, str), model=("model name or path", "positional", None, str),
data_path=("Location of JSON-formatted evaluation data", "positional", data_path=("location of JSON-formatted evaluation data", "positional",
None, str), None, str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool), gold_preproc=("use gold preprocessing", "flag", "G", bool),
gpu_id=("Use GPU", "option", "g", int), gpu_id=("use GPU", "option", "g", int),
displacy_path=("Directory to output rendered parses as HTML", "option", displacy_path=("directory to output rendered parses as HTML", "option",
"dp", str), "dp", str),
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)) displacy_limit=("limit of parses to render as HTML", "option", "dl", int))
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False, def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
displacy_path=None, displacy_limit=25): displacy_path=None, displacy_limit=25):
""" """

View File

@ -16,10 +16,11 @@ from .. import about
input_dir=("directory with model data", "positional", None, str), input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str), output_dir=("output parent directory", "positional", None, str),
meta_path=("path to meta.json", "option", "m", str), meta_path=("path to meta.json", "option", "m", str),
create_meta=("create meta.json, even if one exists in directory", "flag", create_meta=("create meta.json, even if one exists in directory if "
"c", bool), "existing meta is found, entries are shown as defaults in "
force=("force overwriting of existing folder in output directory", "flag", "the command line prompt", "flag", "c", bool),
"f", bool)) force=("force overwriting of existing model directory in output directory",
"flag", "f", bool))
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
force=False): force=False):
""" """
@ -41,13 +42,13 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
template_manifest = get_template('MANIFEST.in') template_manifest = get_template('MANIFEST.in')
template_init = get_template('xx_model_name/__init__.py') template_init = get_template('xx_model_name/__init__.py')
meta_path = meta_path or input_path / 'meta.json' meta_path = meta_path or input_path / 'meta.json'
if not create_meta and meta_path.is_file(): if meta_path.is_file():
prints(meta_path, title="Reading meta.json from file")
meta = util.read_json(meta_path) meta = util.read_json(meta_path)
if not create_meta: # only print this if user doesn't want to overwrite
prints(meta_path, title="Loaded meta.json from file")
else: else:
meta = generate_meta(input_dir) meta = generate_meta(input_dir, meta)
meta = validate_meta(meta, ['lang', 'name', 'version']) meta = validate_meta(meta, ['lang', 'name', 'version'])
model_name = meta['lang'] + '_' + meta['name'] model_name = meta['lang'] + '_' + meta['name']
model_name_v = model_name + '-' + meta['version'] model_name_v = model_name + '-' + meta['version']
main_path = output_path / model_name_v main_path = output_path / model_name_v
@ -82,18 +83,19 @@ def create_file(file_path, contents):
file_path.open('w', encoding='utf-8').write(contents) file_path.open('w', encoding='utf-8').write(contents)
def generate_meta(model_path): def generate_meta(model_path, existing_meta):
meta = {} meta = existing_meta or {}
settings = [('lang', 'Model language', 'en'), settings = [('lang', 'Model language', meta.get('lang', 'en')),
('name', 'Model name', 'model'), ('name', 'Model name', meta.get('name', 'model')),
('version', 'Model version', '0.0.0'), ('version', 'Model version', meta.get('version', '0.0.0')),
('spacy_version', 'Required spaCy version', ('spacy_version', 'Required spaCy version',
'>=%s,<3.0.0' % about.__version__), '>=%s,<3.0.0' % about.__version__),
('description', 'Model description', False), ('description', 'Model description',
('author', 'Author', False), meta.get('description', False)),
('email', 'Author email', False), ('author', 'Author', meta.get('author', False)),
('url', 'Author website', False), ('email', 'Author email', meta.get('email', False)),
('license', 'License', 'CC BY-NC 3.0')] ('url', 'Author website', meta.get('url', False)),
('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
nlp = util.load_model_from_path(Path(model_path)) nlp = util.load_model_from_path(Path(model_path))
meta['pipeline'] = nlp.pipe_names meta['pipeline'] = nlp.pipe_names
meta['vectors'] = {'width': nlp.vocab.vectors_length, meta['vectors'] = {'width': nlp.vocab.vectors_length,

View File

@ -1,31 +1,33 @@
'''Compile a vocabulary from a lexicon jsonl file and word vectors.'''
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from pathlib import Path
import plac import plac
import json import json
import spacy import spacy
import numpy import numpy
from spacy.util import ensure_path from pathlib import Path
from ..util import prints, ensure_path
@plac.annotations( @plac.annotations(
lang=("model language", "positional", None, str), lang=("model language", "positional", None, str),
output_dir=("output directory to store model in", "positional", None, str), output_dir=("model output directory", "positional", None, Path),
lexemes_loc=("location of JSONL-formatted lexical data", "positional", lexemes_loc=("location of JSONL-formatted lexical data", "positional",
None, str), None, Path),
vectors_loc=("location of vectors data, as numpy .npz (optional)", vectors_loc=("optional: location of vectors data, as numpy .npz",
"positional", None, str), "positional", None, str))
version=("Model version", "option", "V", str), def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None):
) """Compile a vocabulary from a lexicon jsonl file and word vectors."""
def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, version=None): if not lexemes_loc.exists():
out_dir = ensure_path(output_dir) prints(lexemes_loc, title="Can't find lexical data", exits=1)
jsonl_loc = ensure_path(lexemes_loc) vectors_loc = ensure_path(vectors_loc)
nlp = spacy.blank(lang) nlp = spacy.blank(lang)
for word in nlp.vocab: for word in nlp.vocab:
word.rank = 0 word.rank = 0
with jsonl_loc.open() as file_: lex_added = 0
vec_added = 0
with lexemes_loc.open() as file_:
for line in file_: for line in file_:
if line.strip(): if line.strip():
attrs = json.loads(line) attrs = json.loads(line)
@ -35,14 +37,18 @@ def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, version=None):
lex = nlp.vocab[attrs['orth']] lex = nlp.vocab[attrs['orth']]
lex.set_attrs(**attrs) lex.set_attrs(**attrs)
assert lex.rank == attrs['id'] assert lex.rank == attrs['id']
lex_added += 1
if vectors_loc is not None: if vectors_loc is not None:
vector_data = numpy.load(open(vectors_loc, 'rb')) vector_data = numpy.load(open(vectors_loc, 'rb'))
nlp.vocab.clear_vectors(width=vector_data.shape[1]) nlp.vocab.clear_vectors(width=vector_data.shape[1])
added = 0
for word in nlp.vocab: for word in nlp.vocab:
if word.rank: if word.rank:
nlp.vocab.vectors.add(word.orth_, row=word.rank, nlp.vocab.vectors.add(word.orth_, row=word.rank,
vector=vector_data[word.rank]) vector=vector_data[word.rank])
added += 1 vec_added += 1
nlp.to_disk(out_dir) if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
title="Sucessfully compiled vocab and vectors, and saved model")
return nlp return nlp

View File

@ -154,6 +154,8 @@ class Language(object):
self._meta.setdefault('email', '') self._meta.setdefault('email', '')
self._meta.setdefault('url', '') self._meta.setdefault('url', '')
self._meta.setdefault('license', '') self._meta.setdefault('license', '')
self._meta['vectors'] = {'width': self.vocab.vectors_length,
'entries': len(self.vocab.vectors)}
self._meta['pipeline'] = self.pipe_names self._meta['pipeline'] = self.pipe_names
return self._meta return self._meta

View File

@ -263,6 +263,15 @@ cdef class Vocab:
The similarities are judged by cosine. The original vectors may The similarities are judged by cosine. The original vectors may
be large, so the cosines are calculated in minibatches, to reduce be large, so the cosines are calculated in minibatches, to reduce
memory usage. memory usage.
nr_row (int): The number of rows to keep in the vector table.
batch_size (int): Batch of vectors for calculating the similarities.
Larger batch sizes might be faster, while temporarily requiring
more memory.
RETURNS (dict): A dictionary keyed by removed words mapped to
`(string, score)` tuples, where `string` is the entry the removed
word was mapped to, and `score` the similarity score between the
two words.
""" """
xp = get_array_module(self.vectors.data) xp = get_array_module(self.vectors.data)
# Work in batches, to avoid memory problems. # Work in batches, to avoid memory problems.
@ -294,6 +303,7 @@ cdef class Vocab:
self.vectors.key2row[key] = neighbours[row-nr_row] self.vectors.key2row[key] = neighbours[row-nr_row]
# Make copy, to encourage the original table to be garbage collected. # Make copy, to encourage the original table to be garbage collected.
self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row]) self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
# TODO: return new mapping
def get_vector(self, orth): def get_vector(self, orth):
"""Retrieve a vector for a word in the vocabulary. Words can be looked """Retrieve a vector for a word in the vocabulary. Words can be looked

View File

@ -41,9 +41,6 @@
- var comps = path.split('#'); - var comps = path.split('#');
- return "top-level#" + comps[0] + '.' + comps[1]; - return "top-level#" + comps[0] + '.' + comps[1];
- } - }
- else if (path.startsWith('cli#')) {
- return "top-level#" + path.split('#')[1];
- }
- return path; - return path;
- } - }

View File

@ -1,5 +1,7 @@
//- 💫 DOCS > API > ANNOTATION > TRAINING //- 💫 DOCS > API > ANNOTATION > TRAINING
+h(3, "json-input") JSON input format for training
p p
| spaCy takes training data in JSON format. The built-in | spaCy takes training data in JSON format. The built-in
| #[+api("cli#convert") #[code convert]] command helps you convert the | #[+api("cli#convert") #[code convert]] command helps you convert the
@ -46,3 +48,57 @@ p
| Treebank: | Treebank:
+github("spacy", "examples/training/training-data.json", false, false, "json") +github("spacy", "examples/training/training-data.json", false, false, "json")
+h(3, "vocab-jsonl") Lexical data for vocabulary
+tag-new(2)
p
| The populate a model's vocabulary, you can use the
| #[+api("cli#vocab") #[code spacy vocab]] command and load in a
| #[+a("https://jsonlines.readthedocs.io/en/latest/") newline-delimited JSON]
| (JSONL) file containing one lexical entry per line. The first line
| defines the language and vocabulary settings. All other lines are
| expected to be JSON objects describing an individual lexeme. The lexical
| attributes will be then set as attributes on spaCy's
| #[+api("lexeme#attributes") #[code Lexeme]] object. The #[code vocab]
| command outputs a ready-to-use spaCy model with a #[code Vocab]
| containing the lexical data.
+code("First line").
{"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
+code("Entry structure").
{
"orth": string,
"id": int,
"lower": string,
"norm": string,
"shape": string
"prefix": string,
"suffix": string,
"length": int,
"cluster": string,
"prob": float,
"is_alpha": bool,
"is_ascii": bool,
"is_digit": bool,
"is_lower": bool,
"is_punct": bool,
"is_space": bool,
"is_title": bool,
"is_upper": bool,
"like_url": bool,
"like_num": bool,
"like_email": bool,
"is_stop": bool,
"is_oov": bool,
"is_quote": bool,
"is_left_punct": bool,
"is_right_punct": bool
}
p
| Here's an example of the 100 most frequent lexemes in the English
| training data:
+github("spacy", "examples/training/vocab-data.jsonl", false, false, "json")

View File

@ -3,8 +3,10 @@
"Overview": { "Overview": {
"Architecture": "./", "Architecture": "./",
"Annotation Specs": "annotation", "Annotation Specs": "annotation",
"Command Line": "cli",
"Functions": "top-level" "Functions": "top-level"
}, },
"Containers": { "Containers": {
"Doc": "doc", "Doc": "doc",
"Token": "token", "Token": "token",
@ -45,14 +47,19 @@
} }
}, },
"cli": {
"title": "Command Line Interface",
"teaser": "Download, train and package models, and debug spaCy.",
"source": "spacy/cli"
},
"top-level": { "top-level": {
"title": "Top-level Functions", "title": "Top-level Functions",
"menu": { "menu": {
"spacy": "spacy", "spacy": "spacy",
"displacy": "displacy", "displacy": "displacy",
"Utility Functions": "util", "Utility Functions": "util",
"Compatibility": "compat", "Compatibility": "compat"
"Command Line": "cli"
} }
}, },
@ -213,7 +220,7 @@
"Lemmatization": "lemmatization", "Lemmatization": "lemmatization",
"Dependencies": "dependency-parsing", "Dependencies": "dependency-parsing",
"Named Entities": "named-entities", "Named Entities": "named-entities",
"Training Data": "training" "Models & Training": "training"
} }
} }
} }

View File

@ -85,7 +85,9 @@ p
+row +row
+cell #[code name] +cell #[code name]
+cell unicode +cell unicode
+cell ISO code of the language class to load. +cell
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]
| of the language class to load.
+row +row
+cell #[code disable] +cell #[code disable]

View File

@ -99,6 +99,6 @@ p This document describes the target annotations spaCy is trained to predict.
include _annotation/_biluo include _annotation/_biluo
+section("training") +section("training")
+h(2, "json-input") JSON input format for training +h(2, "training") Models and training data
include _annotation/_training include _annotation/_training

View File

@ -1,4 +1,6 @@
//- 💫 DOCS > API > TOP-LEVEL > COMMAND LINE INTERFACE //- 💫 DOCS > API > COMMAND LINE INTERFACE
include ../_includes/_mixins
p p
| As of v1.7.0, spaCy comes with new command line helpers to download and | As of v1.7.0, spaCy comes with new command line helpers to download and
@ -34,6 +36,13 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+row("foot")
+cell creates
+cell directory, symlink
+cell
| The installed model package in your #[code site-packages]
| directory and a shortcut link as a symlink in #[code spacy/data].
+aside("Downloading best practices") +aside("Downloading best practices")
| The #[code download] command is mostly intended as a convenient, | The #[code download] command is mostly intended as a convenient,
| interactive wrapper it performs compatibility checks and prints | interactive wrapper it performs compatibility checks and prints
@ -86,6 +95,13 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+row("foot")
+cell creates
+cell symlink
+cell
| A shortcut link of the given name as a symlink in
| #[code spacy/data].
+h(3, "info") Info +h(3, "info") Info
p p
@ -113,6 +129,11 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+row("foot")
+cell prints
+cell #[code stdout]
+cell Information about your spaCy installation.
+h(3, "validate") Validate +h(3, "validate") Validate
+tag-new(2) +tag-new(2)
@ -129,6 +150,12 @@ p
+code(false, "bash", "$"). +code(false, "bash", "$").
spacy validate spacy validate
+table(["Argument", "Type", "Description"])
+row("foot")
+cell prints
+cell #[code stdout]
+cell Details about the compatibility of your installed models.
+h(3, "convert") Convert +h(3, "convert") Convert
p p
@ -172,6 +199,11 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+row("foot")
+cell creates
+cell JSON
+cell Data in spaCy's #[+a("/api/annotation#json-input") JSON format].
p The following converters are available: p The following converters are available:
+table(["ID", "Description"]) +table(["ID", "Description"])
@ -286,6 +318,11 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+row("foot")
+cell creates
+cell model, pickle
+cell A spaCy model on each epoch, and a final #[code .pickle] file.
+h(4, "train-hyperparams") Environment variables for hyperparameters +h(4, "train-hyperparams") Environment variables for hyperparameters
+tag-new(2) +tag-new(2)
@ -395,6 +432,50 @@ p
+cell Gradient L2 norm constraint. +cell Gradient L2 norm constraint.
+cell #[code 1.0] +cell #[code 1.0]
+h(3, "vocab") Vocab
+tag-new(2)
p
| Compile a vocabulary from a
| #[+a("/api/annotation#vocab-jsonl") lexicon JSONL] file and optional
| word vectors. Will save out a valid spaCy model that you can load via
| #[+api("spacy#load") #[code spacy.load]] or package using the
| #[+api("cli#package") #[code package]] command.
+code(false, "bash", "$").
spacy vocab [lang] [output_dir] [lexemes_loc] [vectors_loc]
+table(["Argument", "Type", "Description"])
+row
+cell #[code lang]
+cell positional
+cell
| Model language
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code],
| e.g. #[code en].
+row
+cell #[code output_dir]
+cell positional
+cell Model output directory. Will be created if it doesn't exist.
+row
+cell #[code lexemes_loc]
+cell positional
+cell
| Location of lexical data in spaCy's
| #[+a("/api/annotation#vocab-jsonl") JSONL format].
+row
+cell #[code vectors_loc]
+cell positional
+cell Optional location of vectors data as numpy #[code .npz] file.
+row("foot")
+cell creates
+cell model
+cell A spaCy model containing the vocab and vectors.
+h(3, "evaluate") Evaluate +h(3, "evaluate") Evaluate
+tag-new(2) +tag-new(2)
@ -447,22 +528,36 @@ p
+cell flag +cell flag
+cell Use gold preprocessing. +cell Use gold preprocessing.
+row("foot")
+cell prints / creates
+cell #[code stdout], HTML
+cell Training results and optional displaCy visualizations.
+h(3, "package") Package +h(3, "package") Package
p p
| Generate a #[+a("/usage/training#models-generating") model Python package] | Generate a #[+a("/usage/training#models-generating") model Python package]
| from an existing model data directory. All data files are copied over. | from an existing model data directory. All data files are copied over.
| If the path to a meta.json is supplied, or a meta.json is found in the | If the path to a #[code meta.json] is supplied, or a #[code meta.json] is
| input directory, this file is used. Otherwise, the data can be entered | found in the input directory, this file is used. Otherwise, the data can
| directly from the command line. The required file templates are downloaded | be entered directly from the command line. The required file templates
| from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make | are downloaded from
| #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make
| sure you're always using the latest versions. This means you need to be | sure you're always using the latest versions. This means you need to be
| connected to the internet to use this command. | connected to the internet to use this command. After packaging, you
| can run #[code python setup.py sdist] from the newly created directory
| to turn your model into an installable archive file.
+code(false, "bash", "$", false, false, true). +code(false, "bash", "$", false, false, true).
spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force] spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]
+aside-code("Example", "bash").
spacy package /input /output
cd /output/en_model-0.0.0
python setup.py sdist
pip install dist/en_model-0.0.0.tar.gz
+table(["Argument", "Type", "Description"]) +table(["Argument", "Type", "Description"])
+row +row
+cell #[code input_dir] +cell #[code input_dir]
@ -477,15 +572,16 @@ p
+row +row
+cell #[code --meta-path], #[code -m] +cell #[code --meta-path], #[code -m]
+cell option +cell option
+cell #[+tag-new(2)] Path to meta.json file (optional). +cell #[+tag-new(2)] Path to #[code meta.json] file (optional).
+row +row
+cell #[code --create-meta], #[code -c] +cell #[code --create-meta], #[code -c]
+cell flag +cell flag
+cell +cell
| #[+tag-new(2)] Create a meta.json file on the command line, even | #[+tag-new(2)] Create a #[code meta.json] file on the command
| if one already exists in the directory. | line, even if one already exists in the directory. If an
| existing file is found, its entries will be shown as the defaults
| in the command line prompt.
+row +row
+cell #[code --force], #[code -f] +cell #[code --force], #[code -f]
+cell flag +cell flag
@ -495,3 +591,8 @@ p
+cell #[code --help], #[code -h] +cell #[code --help], #[code -h]
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+row("foot")
+cell creates
+cell directory
+cell A Python package containing the spaCy model.

View File

@ -18,7 +18,3 @@ include ../_includes/_mixins
+section("compat") +section("compat")
+h(2, "compat", "spacy/compaty.py") Compatibility functions +h(2, "compat", "spacy/compaty.py") Compatibility functions
include _top-level/_compat include _top-level/_compat
+section("cli", "spacy/cli")
+h(2, "cli") Command line
include _top-level/_cli

View File

@ -162,7 +162,7 @@ p
+cell int +cell int
+cell The integer ID by which the flag value can be checked. +cell The integer ID by which the flag value can be checked.
+h(2, "add_flag") Vocab.clear_vectors +h(2, "clear_vectors") Vocab.clear_vectors
+tag method +tag method
+tag-new(2) +tag-new(2)
@ -181,7 +181,50 @@ p
| Number of dimensions of the new vectors. If #[code None], size | Number of dimensions of the new vectors. If #[code None], size
| is not changed. | is not changed.
+h(2, "add_flag") Vocab.get_vector +h(2, "prune_vectors") Vocab.prune_vectors
+tag method
+tag-new(2)
p
| Reduce the current vector table to #[code nr_row] unique entries. Words
| mapped to the discarded vectors will be remapped to the closest vector
| among those remaining. For example, suppose the original table had
| vectors for the words:
| #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
| vector table to, two rows, we would discard the vectors for "feline"
| and "reclined". These words would then be remapped to the closest
| remaining vector so "feline" would have the same vector as "cat",
| and "reclined" would have the same vector as "sat". The similarities are
| judged by cosine. The original vectors may be large, so the cosines are
| calculated in minibatches, to reduce memory usage.
+aside-code("Example").
nlp.vocab.prune_vectors(10000)
assert len(nlp.vocab.vectors) &lt;= 1000
+table(["Name", "Type", "Description"])
+row
+cell #[code nr_row]
+cell int
+cell The number of rows to keep in the vector table.
+row
+cell #[code batch_size]
+cell int
+cell
| Batch of vectors for calculating the similarities. Larger batch
| sizes might be faster, while temporarily requiring more memory.
+row("foot")
+cell returns
+cell dict
+cell
| A dictionary keyed by removed words mapped to
| #[code (string, score)] tuples, where #[code string] is the entry
| the removed word was mapped to, and #[code score] the similarity
| score between the two words.
+h(2, "get_vector") Vocab.get_vector
+tag method +tag method
+tag-new(2) +tag-new(2)
@ -206,7 +249,7 @@ p
| A word vector. Size and shape are determined by the | A word vector. Size and shape are determined by the
| #[code Vocab.vectors] instance. | #[code Vocab.vectors] instance.
+h(2, "add_flag") Vocab.set_vector +h(2, "set_vector") Vocab.set_vector
+tag method +tag method
+tag-new(2) +tag-new(2)
@ -228,7 +271,7 @@ p
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell The vector to set. +cell The vector to set.
+h(2, "add_flag") Vocab.has_vector +h(2, "has_vector") Vocab.has_vector
+tag method +tag method
+tag-new(2) +tag-new(2)