From df110476d5bd090413562290e61b9aeea608459a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 15 Oct 2014 21:50:34 +1100 Subject: [PATCH] * Update docs --- docs/source/guide/chart.svg | 4 + docs/source/guide/overview.rst | 121 +++++++----------- docs/source/{ => how}/api/index.rst | 0 docs/source/{ => how}/api/lexicon.rst | 0 docs/source/{ => how}/api/tokenizers/en.rst | 0 .../source/{ => how}/api/tokenizers/index.rst | 0 docs/source/how/index.rst | 13 ++ docs/source/index.rst | 81 ++++-------- docs/source/what/index.rst | 31 +++++ docs/source/why/index.rst | 28 ++++ 10 files changed, 147 insertions(+), 131 deletions(-) create mode 100644 docs/source/guide/chart.svg rename docs/source/{ => how}/api/index.rst (100%) rename docs/source/{ => how}/api/lexicon.rst (100%) rename docs/source/{ => how}/api/tokenizers/en.rst (100%) rename docs/source/{ => how}/api/tokenizers/index.rst (100%) create mode 100644 docs/source/how/index.rst create mode 100644 docs/source/what/index.rst create mode 100644 docs/source/why/index.rst diff --git a/docs/source/guide/chart.svg b/docs/source/guide/chart.svg new file mode 100644 index 000000000..3ac5ec2fc --- /dev/null +++ b/docs/source/guide/chart.svg @@ -0,0 +1,4 @@ + +# Types vs. # Tokens in English Gigaword100M200M300M400M500M600M700M800M900M1G1.1G1.2G1.3G1.4G1.5G1.6G# Types vs. 
# Tokens in English Gigaword# Files51.925k6.09230769231164.76923076991.041k7.13551106428164.765503744118.418k8.17871443625164.762895227148.113k9.22191780822164.760065848171.386k10.2651211802164.757848365195.482k11.3083245522164.755552466216.697k12.3515279241164.753531073235.63k13.3947312961164.751727111255.662k14.4379346681164.749818435276.115k15.48113804164.747869646298.019k16.524341412164.745782604316.206k17.567544784164.744049722334.914k18.610748156164.742267199353.556k19.6539515279164.740490964371.378k20.6971548999164.73879286387.398k21.7403582719164.737266453404.005k22.7835616438164.735684116422.061k23.8267650158164.733963716439.214k24.8699683878164.732329355454.576k25.9131717597164.730865643473.222k26.9563751317164.729089027490.952k27.9995785037164.727399689510.241k29.0427818757164.725561807528.058k30.0859852476164.72386418544.46k31.1291886196164.722301375562.775k32.1723919916164.720556297582.928k33.2155953635164.718636093603.038k34.2587987355164.716719985623.867k35.3020021075164.71473537646.212k36.3452054795164.712606308666.459k37.3884088514164.710677147683.61k38.4316122234164.709042977702.196k39.4748155954164.707272078720.154k40.5180189673164.705561016739.217k41.5612223393164.703744668757.271k42.6044257113164.702024458764.619k43.6476290832164.701324331783.204k44.6908324552164.699553527812.395k45.7340358272164.69677217838.632k46.7772391992164.694272273862.399k47.8204425711164.692007722884.67k48.8636459431164.689885711911.017k49.9068493151164.687375334935.195k50.950052687164.685071621935.195k51.993256059164.685071621952.408k53.036459431164.683431544968.59k54.079662803164.681889701983.132k55.1228661749164.680504121M56.1660695469164.6788558481.017M57.2092729189164.677230731.035M58.2524762908164.6755435831.052M59.2956796628164.6739043631.068M60.3388830348164.6724506551.083M61.3820864067164.670958741.099M62.4252897787164.66946531.111M63.4684931507164.6683054411.124M64.5116965227164.6671268111.134M65.5548998946164.6660828131.145M66.5981032666164.6650396731.156M67.6
413066386164.6640554171.167M68.6845100105164.6629945541.178M69.7277133825164.6619724711.189M70.7709167545164.6608886461.199M71.8141201264164.6599452661.21M72.8573234984164.6589112721.218M73.9005268704164.6580981411.228M74.9437302424164.6571978281.238M75.9869336143164.6562143341.249M77.0301369863164.6552153091.259M78.0733403583164.6542407721.269M79.1165437302164.653246131.279M80.1597471022164.6523076091.29M81.2029504742164.6512638021.301M82.2461538462164.6502398131.312M83.2893572181164.6491589421.324M84.3325605901164.6480404341.336M85.3757639621164.6468735231.348M86.418967334164.6457787411.36M87.462170706164.6446316491.372M88.505374078164.6434696931.392M89.5485774499164.6415192841.422M90.5917808219164.6386688471.458M91.6349841939164.6352799761.492M92.6781875659164.63205881.534M93.7213909378164.6280613671.573M94.7645943098164.6243435841.607M95.8077976818164.6210863911.632M96.8510010537164.6186731061.644M97.8942044257164.6175813721.655M98.9374077977164.6165089811.666M99.9806111697164.6154000961.678M101.023814542164.6142818751.69M102.067017914164.6131866161.702M103.110221286164.6120117971.713M104.153424658164.6109313061.724M105.19662803164.6098976941.737M106.239831401164.6086983881.749M107.283034773164.607576451.76M108.326238145164.6064394581.773M109.369441517164.6052694991.785M110.412644889164.6041154511.794M111.455848261164.6032865991.794M112.499051633164.6032865991.794M113.542255005164.6032771661.809M114.585458377164.6018048791.828M115.628661749164.5999675691.845M116.671865121164.5983499771.86M117.715068493164.5969106571.878M118.758271865164.5952085521.894M119.801475237164.5936751891.911M120.844678609164.5920488321.928M121.887881981164.5904322891.946M122.931085353164.5888051691.962M123.974288725164.5872496061.976M125.017492097164.5858829861.99M126.060695469164.5845848732.002M127.103898841164.5834259662.014M128.147102213164.5822889742.025M129.190305585164.5812705122.035M130.233508957164.5802560512.047M131.276712329164.5791026712.059M132.319915701164.5780277062.073M133
.363119073164.5766827152.086M134.406322445164.5754299562.105M135.449525817164.5735943612.128M136.492729189164.571426522.149M137.535932561164.5694462882.171M138.579135933164.567275972.191M139.622339305164.5654622892.213M140.665542677164.5633461862.234M141.708746048164.5612910632.255M142.75194942164.5593370332.277M143.795152792164.5572014932.299M144.838356164164.5551258842.322M145.881559536164.5529294592.344M146.924762908164.5508077342.367M147.96796628164.5486734322.38M149.011169652164.5474513532.401M150.054373024164.5453997562.42M151.097576396164.543557112.439M152.140779768164.5417421912.461M153.18398314164.539721752.481M154.227186512164.5378159332.499M155.270389884164.5360754282.519M156.313593256164.5341232092.54M157.356796628164.532137262.563M158.4164.5299692292.586M159.443203372164.5278228262.607M160.486406744164.5257925712.628M161.529610116164.523795762.651M162.572813488164.5215930462.671M163.61601686164.5196516892.693M164.659220232164.5176288662.716M165.702423604164.5154364422.737M166.745626976164.5134245772.758M167.788830348164.5113999442.779M168.83203372164.5093975112.791M169.875237092164.5082464172.804M170.918440464164.5070233862.814M171.961643836164.5060123562.828M173.004847208164.5047513072.841M174.04805058164.5034574822.855M175.091253952164.5021380262.868M176.134457323164.5008905072.881M177.177660695164.4996291732.894M178.220864067164.4983975672.908M179.264067439164.4971334692.92M180.307270811164.4959192042.933M181.350474183164.4947369532.944M182.393677555164.4936781872.957M183.436880927164.4924378142.97M184.480084299164.4912282182.982M185.523287671164.4900776952.994M186.566491043164.4889461343.005M187.609694415164.487898233.016M188.652897787164.4867734343.027M189.696101159164.4857569733.038M190.739304531164.484672483.05M191.782507903164.483608953.061M192.825711275164.4825547573.072M193.868914647164.4814331053.084M194.912118019164.4803674793.094M195.955321391164.4793694073.103M196.998524763164.4785634223.113M198.041728135164.4775888843.124M199.084931507164
.4765336443.136M200.128134879164.4753895053.149M201.171338251164.4741222643.16M202.214541623164.4731305753.172M203.257744995164.4719277443.185M204.300948367164.4707456833.196M205.344151739164.4696126933.209M206.387355111164.4684416853.221M207.430558483164.4672386633.232M208.473761855164.4661979053.245M209.516965227164.4650243253.256M210.560168599164.4639447873.266M211.60337197164.4629671063.277M212.646575342164.461913773.288M213.689778714164.4608764423.299M214.732982086164.4598542643.31M215.776185458164.4587629123.321M216.81938883164.4577080523.332M217.862592202164.4567062643.341M218.905795574164.4558119533.351M219.948998946164.4548733373.362M220.992202318164.4538389633.373M222.03540569164.4527570433.384M223.078609062164.4517562083.394M224.121812434164.4507935813.4M225.165015806164.4502576233.405M226.208219178164.4497446283.412M227.25142255164.4490465023.419M228.294625922164.4484408943.426M229.337829294164.4477580123.434M230.381032666164.4470253943.44M231.424236038164.4463818643.448M232.46743941164.445689743.454M233.510642782164.4450968053.461M234.553846154164.4444389823.468M235.597049526164.443759153.474M236.640252898164.4432091863.48M237.68345627164.4426458833.485M238.726659642164.4421015413.492M239.769863014164.441464683.498M240.813066386164.440865363.505M241.856269758164.4402248793.512M242.89947313164.4395632463.52M243.942676502164.4388179553.527M244.985879874164.4381375513.533M246.029083246164.4375496653.539M247.072286617164.4369418663.546M248.115489989164.4363347333.552M249.158693361164.4357577093.558M250.201896733164.4351476233.564M251.245100105164.4346123323.571M252.288303477164.4339445053.577M253.331506849164.4333363243.584M254.374710221164.4326671643.591M255.417913593164.43206473.597M256.461116965164.431483963.603M257.504320337164.4308890233.609M258.547523709164.4302999943.616M259.590727081164.4296270223.623M260.633930453164.4289954023.63M261.677133825164.4283271943.636M262.720337197164.4277228243.642M263.763540569164.4271368443.65M264.806743941164.4263986
043.657M265.849947313164.4257770843.664M266.893150685164.4250922023.671M267.936354057164.4243868343.678M268.979557429164.423719963.685M270.022760801164.4230741433.691M271.065964173164.4224555773.698M272.109167545164.4218034713.705M273.152370917164.4211713753.712M274.195574289164.4204683893.719M275.238777661164.4198602083.724M276.281981033164.4193632213.731M277.325184405164.4187288383.737M278.368387777164.4180921683.743M279.411591149164.4175006613.75M280.454794521164.4168883833.757M281.497997893164.4162384693.764M282.541201264164.4155251933.771M283.584404636164.4148598443.778M284.627608008164.4141734373.785M285.67081138164.4135538223.791M286.714014752164.4129485953.797M287.757218124164.4123801473.803M288.800421496164.4117899743.81M289.843624868164.4111114763.818M290.88682824164.4104274513.825M291.930031612164.409753243.832M292.973234984164.4090988483.839M294.016438356164.4084080583.846M295.059641728164.4077436623.853M296.1028451164.407043823.86M297.146048472164.4064292563.866M298.189251844164.4058169783.872M299.232455216164.4052496733.878M300.275658588164.4047045683.884M301.31886196164.4041172543.891M302.362065332164.4034345633.898M303.405268704164.4028140913.904M304.448472076164.4021800893.911M305.491675448164.4015645723.917M306.53487882164.4009284733.925M307.578082192164.4002214863.931M308.621285564164.3996372213.937M309.664488936164.3990608643.938M310.707692308164.3989480511.676M6.09230769231164.6144458544.637M7.13551106428164.3323611647.457M8.17871443625164.06365311610.197M9.22191780822163.80259911413.095M10.2651211802163.52647741516.236M11.3083245522163.22721940719.259M12.3515279241162.93914171222.155M13.3947312961162.6632575525.204M14.4379346681162.37269062928.322M15.48113804162.07557673731.902M16.524341412161.73447259535.08M17.567544784161.43171709538.512M18.610748156161.1046875342.142M19.6539515279160.75883866345.474M20.6971548999160.44134209248.446M21.7403582719160.15818548751.679M22.7835616438159.85017912755.205M23.8267650158159.51418663858.864M24.869968387
8159.1655435562.142M25.9131717597158.85317568766.01M26.9563751317158.48468568769.802M27.9995785037158.12331253674.147M29.0427818757157.70932769878.137M30.0859852476157.32920641981.691M31.1291886196156.99052261385.785M32.1723919916156.60044891290.072M33.2155953635156.19202240694.337M34.2587987355155.78560404999.124M35.3020021075155.329465603104.299M36.3452054795154.83640097109.156M37.3884088514154.373601596113.461M38.4316122234153.963437921117.911M39.4748155954153.539429104122.504M40.5180189673153.101807824127.405M41.5612223393152.63481874132.253M42.6044257113152.172965605134.174M43.6476290832151.989930535138.768M44.6908324552151.552141369146.126M45.7340358272150.851133981152.104M46.7772391992150.281487543156.951M47.8204425711149.819686718161.482M48.8636459431149.38799451167.96M49.9068493151148.770741895173.504M50.950052687148.242476143173.504M51.993256059148.242476143178.971M53.036459431147.721561537184.514M54.079662803147.193399927189.366M55.1228661749146.731136607195.195M56.1660695469146.175739589200.782M57.2092729189145.643390174207.578M58.2524762908144.995906029214.976M59.2956796628144.290939891221.737M60.3388830348143.646802408228.394M61.3820864067143.012538549234.837M62.4252897787142.398557896239.419M63.4684931507141.961968131243.909M64.5116965227141.534196655248.073M65.5548998946141.13740743252.523M66.5981032666140.713486082256.659M67.6413066386140.319346155261.395M68.6845100105139.868141473265.954M69.7277133825139.433730697270.573M70.7709167545138.99365173274.561M71.8141201264138.613677566278.761M72.8573234984138.213483558282.037M73.9005268704137.901299588285.972M74.9437302424137.526420021290.223M75.9869336143137.121296824294.516M77.0301369863136.712308348299.01M78.0733403583136.284135261303.862M79.1165437302135.821776564308.3M80.1597471022135.398950665313.397M81.2029504742134.913299111318.263M82.2461538462134.449661166323.307M83.2893572181133.969047326328.272M84.3325605901133.496013117333.528M85.3757639621132.995137701338.381M86.418967334132.53276614343.707
M87.462170706132.025295445348.887M88.505374078131.531797091355.405M89.5485774499130.910747134363.722M90.5917808219130.118223449372.46M91.6349841939129.285728099381.581M92.6781875659128.416585165391.011M93.7213909378127.518087956400.052M94.7645943098126.656703209409.084M95.8077976818125.796140835417.682M96.8510010537124.976881588422.894M97.8942044257124.480267629428.038M98.9374077977123.99010517433.642M99.9806111697123.456227824439.328M101.023814542122.914409921444.71M102.067017914122.401656066451.239M103.110221286121.779511612457.157M104.153424658121.215636842462.629M105.19662803120.694236492469.491M106.239831401120.040426238475.766M107.283034773119.442551275482.478M108.326238145118.80302779489.338M109.369441517118.149402382495.606M110.412644889117.552220972499.768M111.455848261117.15559001499.768M112.499051633117.15559001499.797M113.542255005117.152819705506.16M114.585458377116.546587044515.314M115.628661749115.674341463523.505M116.671865121114.893944806531.011M117.715068493114.178760888539.654M118.758271865113.355250854547.837M119.801475237112.575571475556.758M120.844678609111.725525589565.807M121.887881981110.863296268574.988M122.931085353109.988525351584.061M123.974288725109.124036909589.277M125.017492097108.627054212594.636M126.060695469108.116495544599.966M127.103898841107.60864744605.3M128.147102213107.100360088610.027M129.190305585106.64996806614.912M130.233508957106.184496902620.811M131.276712329105.622457155626.403M132.319915701105.089634192633.502M133.363119073104.413258077640.282M134.406322445103.767217349650.12M135.449525817102.829843163661.786M136.492729189101.718317557672.361M137.535932561100.710741604684.408M138.57913593399.5628367257694.737M139.62233930598.5786653954707.267M140.66554267797.3848007057719.681M141.70874604896.2020286695731.517M142.7519494295.0742214837744.745M143.79515279293.8138630923757.109M144.83835616492.6358173916771.319M145.88155953691.281842322785.695M146.92476290889.9120812308800.923M147.9679662888.4611659747809.497M149.0111696
5287.6441730893822.93M150.05437302486.3642592512835.905M151.09757639685.1280243796849.267M152.14077976883.8549273496863.941M153.1839831482.456754602876.708M154.22718651281.2402885119889.476M155.27038988480.0237198991902.867M156.31359325678.7477958718915.074M157.35679662877.5847362106928.107M158.476.3429342409940.99M159.44320337275.1153947443953.821M160.48640674473.8928833391966.59M161.52961011672.676169327980.592M162.57281348871.3420936696993.349M163.6160168670.12655895471.007G164.65922023268.78456579921.022G165.70242360467.40020827391.035G166.74562697666.17418679951.048G167.78883034864.88167400541.062G168.8320337263.58670781211.071G169.87523709262.76567347341.08G170.91844046461.85445553851.088G171.96164383661.12519370771.098G173.00484720860.18920157671.108G174.0480505859.21465499551.117G175.09125395258.31365417371.127G176.13445732357.38851372971.137G177.17766069556.42717056931.147G178.22086406755.47939585271.157G179.26406743954.54307557281.167G180.30727081153.57815469351.177G181.35047418352.61007981141.186G182.39367755551.75671241751.196G183.43688092750.80780194731.206G184.48008429949.86156735831.216G185.52328767148.93604998121.225G186.56649104348.0237990061.235G187.60969441547.13912502281.244G188.65289778746.27177337741.252G189.69610115945.45923822941.261G190.73930453144.5820247751.27G191.78250790343.73624006061.278G192.82571127543.02910874981.287G193.86891464742.14858903291.296G194.91211801941.28638153181.305G195.95532139140.46063705541.312G196.99852476339.81081003781.319G198.04172813539.08152048021.328G199.08493150738.26224493981.337G200.12813487937.34975623761.348G201.17133825136.36044724221.355G202.21454162335.63880115351.365G203.25774499534.68175041751.375G204.30094836733.74891467141.384G205.34415173932.87118040921.395G206.38735511131.88843544041.404G207.43055848330.98305072431.414G208.47376185530.0714512831.424G209.51696522729.08922721731.434G210.56016859928.17910025371.442G211.6033719727.35858786621.452G212.64657534226.46401157931.461G213.68977871425.535353
15771.472G214.73298208624.50591633131.483G215.77618545823.46507299481.494G216.8193888322.47086882541.503G217.86259220221.60061043221.512G218.90579557420.73629578421.522G219.94899894619.79733563951.532G220.99220231818.78480400941.543G222.0354056917.79132874211.553G223.07860906216.82760974171.562G224.12181243415.90168389351.564G225.16501580615.78762479471.565G226.20821917815.67829184091.566G227.2514225515.5372767941.568G228.29462592215.41408240631.569G229.33782929415.27287422411.571G230.38103266615.13411982221.572G231.42423603815.00739297421.573G232.4674394114.87310907741.575G233.51064278214.73975169641.576G234.55384615414.60128742611.578G235.59704952614.46435385061.579G236.64025289814.33762119041.58G237.6834562714.20981423321.581G238.72665964214.09264229311.583G239.76986301413.94327269251.585G240.81306638613.79912736781.586G241.85626975813.64696240331.588G242.8994731313.49511729841.589G243.94267650213.33037929431.591G244.98587987413.18069154941.593G246.02908324613.03103943971.594G247.07228661712.87803828621.596G248.11548998912.72827088611.597G249.15869336112.58402408671.599G250.20189673312.43893871631.6G251.24510010512.30945471221.602G252.28830347712.1396464071.604G253.33150684911.98777500441.605G254.37471022111.82827699761.607G255.41791359311.67999074581.608G256.46111696511.53516063411.61G257.50432033711.39558986171.611G258.54752370911.23695472271.613G259.59072708111.07617699221.615G260.63393045310.91454439761.616G261.67713382510.75339983391.618G262.72033719710.60505793791.62G263.76354056910.45525671291.621G264.80674394110.28698281841.623G265.84994731310.13488464621.625G266.8931506859.967071818081.627G267.9363540579.798683109571.628G268.9795574299.630384156091.63G270.0227608019.474074834541.632G271.0659641739.313944540761.633G272.1091675459.149626536881.635G273.1523709178.988307227281.637G274.1955742898.809968383881.638G275.2387776618.65966874221.64G276.2819810338.524570475811.642G277.3251844058.346130348351.644G278.3683877778.176664674871.645G279.4115911498.0118357
72451.647G280.4547945217.85114065091.649G281.4979978937.689681468291.65G282.5412012647.527906809181.652G283.5844046367.354841406741.654G284.6276080087.193014819291.655G285.670811387.038832367781.657G286.7140147526.877492096291.659G287.7572181246.731265160131.66G288.8004214966.587787185871.662G289.8436248686.412920585011.664G290.886828246.241577392761.666G291.9300316126.073567141731.667G292.9732349845.902200605561.669G294.0164383565.736049769851.671G295.0596417285.568769850281.673G296.10284515.403507608331.674G297.1460484725.236017212281.676G298.1892518445.07506235381.678G299.2324552164.923061368531.679G300.2756585884.787280316071.681G301.318861964.638508510651.683G302.3620653324.454105867591.684G303.4052687044.28786508631.686G304.4484720764.120216046831.688G305.4916754483.955979317921.69G306.534878823.79169113711.692G307.5780821923.601572375991.693G308.6212855643.436381880891.695G309.6644889363.26288733131.695G310.7076923083.2307692307751.925k91.041k118.418k148.113k171.386k195.482k216.697k235.63k255.662k276.115k298.019k316.206k334.914k353.556k371.378k387.398k404.005k422.061k439.214k454.576k473.222k490.952k510.241k528.058k544.46k562.775k582.928k603.038k623.867k646.212k666.459k683.61k702.196k720.154k739.217k757.271k764.619k783.204k812.395k838.632k862.399k884.67k911.017k935.195k935.195k952.408k968.59k983.132k1M1.017M1.035M1.052M1.068M1.083M1.099M1.111M1.124M1.134M1.145M1.156M1.167M1.178M1.189M1.199M1.21M1.218M1.228M1.238M1.249M1.259M1.269M1.279M1.29M1.301M1.312M1.324M1.336M1.348M1.36M1.372M1.392M1.422M1.458M1.492M1.534M1.573M1.607M1.632M1.644M1.655M1.666M1.678M1.69M1.702M1.713M1.724M1.737M1.749M1.76M1.773M1.785M1.794M1.794M1.794M1.809M1.828M1.845M1.86M1.878M1.894M1.911M1.928M1.946M1.962M1.976M1.99M2.002M2.014M2.025M2.035M2.047M2.059M2.073M2.086M2.105M2.128M2.149M2.171M2.191M2.213M2.234M2.255M2.277M2.299M2.322M2.344M2.367M2.38M2.401M2.42M2.439M2.461M2.481M2.499M2.519M2.54M2.563M2.586M2.607M2.628M2.651M2.671M2.693M2.716M2.737M2.758M2.779M2.791M2.804M2.814M2.828M2.841M2.8
55M2.868M2.881M2.894M2.908M2.92M2.933M2.944M2.957M2.97M2.982M2.994M3.005M3.016M3.027M3.038M3.05M3.061M3.072M3.084M3.094M3.103M3.113M3.124M3.136M3.149M3.16M3.172M3.185M3.196M3.209M3.221M3.232M3.245M3.256M3.266M3.277M3.288M3.299M3.31M3.321M3.332M3.341M3.351M3.362M3.373M3.384M3.394M3.4M3.405M3.412M3.419M3.426M3.434M3.44M3.448M3.454M3.461M3.468M3.474M3.48M3.485M3.492M3.498M3.505M3.512M3.52M3.527M3.533M3.539M3.546M3.552M3.558M3.564M3.571M3.577M3.584M3.591M3.597M3.603M3.609M3.616M3.623M3.63M3.636M3.642M3.65M3.657M3.664M3.671M3.678M3.685M3.691M3.698M3.705M3.712M3.719M3.724M3.731M3.737M3.743M3.75M3.757M3.764M3.771M3.778M3.785M3.791M3.797M3.803M3.81M3.818M3.825M3.832M3.839M3.846M3.853M3.86M3.866M3.872M3.878M3.884M3.891M3.898M3.904M3.911M3.917M3.925M3.931M3.937M3.938M1.676M4.637M7.457M10.197M13.095M16.236M19.259M22.155M25.204M28.322M31.902M35.08M38.512M42.142M45.474M48.446M51.679M55.205M58.864M62.142M66.01M69.802M74.147M78.137M81.691M85.785M90.072M94.337M99.124M104.299M109.156M113.461M117.911M122.504M127.405M132.253M134.174M138.768M146.126M152.104M156.951M161.482M167.96M173.504M173.504M178.971M184.514M189.366M195.195M200.782M207.578M214.976M221.737M228.394M234.837M239.419M243.909M248.073M252.523M256.659M261.395M265.954M270.573M274.561M278.761M282.037M285.972M290.223M294.516M299.01M303.862M308.3M313.397M318.263M323.307M328.272M333.528M338.381M343.707M348.887M355.405M363.722M372.46M381.581M391.011M400.052M409.084M417.682M422.894M428.038M433.642M439.328M444.71M451.239M457.157M462.629M469.491M475.766M482.478M489.338M495.606M499.768M499.768M499.797M506.16M515.314M523.505M531.011M539.654M547.837M556.758M565.807M574.988M584.061M589.277M594.636M599.966M605.3M610.027M614.912M620.811M626.403M633.502M640.282M650.12M661.786M672.361M684.408M694.737M707.267M719.681M731.517M744.745M757.109M771.319M785.695M800.923M809.497M822.93M835.905M849.267M863.941M876.708M889.476M902.867M915.074M928.107M940.99M953.821M966.59M980.592M993.349M1.007G1.022G1.035G1.048G1.062G1.071G1.08G1.088G1.098G1.108G1.11
7G1.127G1.137G1.147G1.157G1.167G1.177G1.186G1.196G1.206G1.216G1.225G1.235G1.244G1.252G1.261G1.27G1.278G1.287G1.296G1.305G1.312G1.319G1.328G1.337G1.348G1.355G1.365G1.375G1.384G1.395G1.404G1.414G1.424G1.434G1.442G1.452G1.461G1.472G1.483G1.494G1.503G1.512G1.522G1.532G1.543G1.553G1.562G1.564G1.565G1.566G1.568G1.569G1.571G1.572G1.573G1.575G1.576G1.578G1.579G1.58G1.581G1.583G1.585G1.586G1.588G1.589G1.591G1.593G1.594G1.596G1.597G1.599G1.6G1.602G1.604G1.605G1.607G1.608G1.61G1.611G1.613G1.615G1.616G1.618G1.62G1.621G1.623G1.625G1.627G1.628G1.63G1.632G1.633G1.635G1.637G1.638G1.64G1.642G1.644G1.645G1.647G1.649G1.65G1.652G1.654G1.655G1.657G1.659G1.66G1.662G1.664G1.666G1.667G1.669G1.671G1.673G1.674G1.676G1.678G1.679G1.681G1.683G1.684G1.686G1.688G1.69G1.692G1.693G1.695G1.695G# Types# Tokens \ No newline at end of file diff --git a/docs/source/guide/overview.rst b/docs/source/guide/overview.rst index 44d750490..59d0810d8 100644 --- a/docs/source/guide/overview.rst +++ b/docs/source/guide/overview.rst @@ -1,106 +1,71 @@ -Don't Settle for a List of Strings -================================== +Overview +======== +What and Why +------------ - *"Other NLP tokenizers return lists of strings, which is downright - barbaric."* --- me +spaCy is a lightning-fast, full-cream NLP tokenizer and lexicon. +Most tokenizers give you a sequence of strings. That's barbaric. +Giving you strings invites you to compute on every *token*, when what +you should be doing is computing on every *type*. Remember +`Zipf's law `_: you'll +see exponentially fewer types than tokens. -spaCy splits text into a list of lexical types, which come with a variety of -features pre-computed. 
It's designed to **make the right thing easy**, where the right -thing is: +Instead of strings, spaCy gives you references to Lexeme objects, from which you +can access an excellent set of pre-computed orthographic and distributional features: -* A global vocabulary store; +:: -* Cached orthographic features; + >>> from spacy import en + >>> apples, are, nt, oranges, dots = en.EN.tokenize(u"Apples aren't oranges...") + >>> are.prob >= oranges.prob + True + >>> apples.check_flag(en.IS_TITLE) + True + >>> apples.check_flag(en.OFT_TITLE) + False + >>> are.check_flag(en.CAN_NOUN) + False -* Clever use of distributional data. - -Let's say you're writing an entity tagger for English. Case distinctions are an -important feature here: you need to know whether the word you're tagging is -upper-cased, lower-cased, title-cased, non-alphabetic, etc. -The right thing is to call the string.isupper(), string.islower(), string.isalpha() -etc functions once for every *type* in your vocabulary, instead -of once for every *token* in the text you're tagging. -When you encounter a new word, you want to create a lexeme object, calculate its -features, and save it. +spaCy makes it easy to write very efficient NLP applications, because your feature +functions have to do almost no work: almost every lexical property you'll want +is pre-computed for you. See the tutorial for an example POS tagger. -That's the *right* way to do it, so it's what spaCy does for you. +Benchmark +--------- -Other tokenizers give you a list of strings, which makes it really easy to do -the wrong thing. And the wrong thing isn't just a little bit worse: it's -**exponentially** worse, because of -`Zipf's law `_. - -.. raw:: html - -
-
- -
-
- -Over the Gigaword corpus, if you compute some feature on a per-token basis, you'll -make **500x more calls** to that function than if you had computed it on a per-token -basis. -(Mouse-over a line to see its value at that point. And yes, it's a bit snarky -to present the graph in a linear scale --- but it isn't misleading.) - -Zipf's Law also makes distributional information a really powerful source of -type-based features. It's really handy to know where a word falls in the language's -frequency distribution, especially compared to variants of the word. For instance, -we might be processing a Twitter comment that contains the string "nasa". We have -little hope of recognising this as an entity except by noting that the string "NASA" -is much more common, and that both strings are quite rare. - -.. Each spaCy Lexeme comes with a rich, curated set of orthographic and -.. distributional features. Different languages get a different set of features, -.. to take into account different orthographic conventions and morphological -.. complexity. It's also easy to define your own features. - -.. And, of course, we take care to get the details right. Indices into the original -.. text are always easy to calculate, so it's easy to, say, mark entities with in-line -.. mark-up. You'll also receive tokens for newlines, tabs and other non-space whitespace, -.. making it easy to do paragraph and sentence recognition. And, of course, we deal -.. smartly with all the random unicode whitespace and punctuation characters you might -.. not have thought of. - - -Benchmarks ----------- - -We here ask two things: - -1. How fast is the spaCy tokenizer itself, relative to other tokenizers? - -2. How fast are applications using spaCy's pre-computed lexical features, - compared to applications that re-compute their features on every token? 
+The tokenizer itself is also very efficient: +--------+-------+--------------+--------------+ | System | Time | Words/second | Speed Factor | +--------+-------+--------------+--------------+ | NLTK | 6m4s | 89,000 | 1.00 | +--------+-------+--------------+--------------+ -| spaCy | | | | +| spaCy | 9.5s | 3,093,000 | 38.30 | +--------+-------+--------------+--------------+ - -spaCy uses more memory than a standard tokenizer, but is far more efficient. We -compare against the NLTK tokenizer and the Penn Treebank's tokenizer.sed script. -We also give the performance of Python's native string.split, for reference. - +The comparison refers to 30 million words from the English Gigaword, on +a MacBook Air. For context, calling string.split() on the data completes in +about 5s. Pros and Cons ------------- Pros: -- Stuff +- All tokens come with indices into the original string +- Full unicode support +- Extensible to other languages +- Batch operations computed efficiently in Cython +- Cython API +- numpy interoperability Cons: - It's new (released September 2014) +- Security concerns, from memory management - Higher memory usage (up to 1gb) -- More complicated +- More conceptually complicated +- Tokenization rules expressed in code, not as data + diff --git a/docs/source/api/index.rst b/docs/source/how/api/index.rst similarity index 100% rename from docs/source/api/index.rst rename to docs/source/how/api/index.rst diff --git a/docs/source/api/lexicon.rst b/docs/source/how/api/lexicon.rst similarity index 100% rename from docs/source/api/lexicon.rst rename to docs/source/how/api/lexicon.rst diff --git a/docs/source/api/tokenizers/en.rst b/docs/source/how/api/tokenizers/en.rst similarity index 100% rename from docs/source/api/tokenizers/en.rst rename to docs/source/how/api/tokenizers/en.rst diff --git a/docs/source/api/tokenizers/index.rst b/docs/source/how/api/tokenizers/index.rst similarity index 100% rename from docs/source/api/tokenizers/index.rst rename to 
docs/source/how/api/tokenizers/index.rst diff --git a/docs/source/how/index.rst b/docs/source/how/index.rst new file mode 100644 index 000000000..bd995f8c8 --- /dev/null +++ b/docs/source/how/index.rst @@ -0,0 +1,13 @@ +How +=== + +Tutorial +-------- + +Installation +------------ + +API +--- + + diff --git a/docs/source/index.rst b/docs/source/index.rst index 6bced7b15..20e06360d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -6,67 +6,42 @@ spaCy NLP Tokenizer and Lexicon ================================ -spaCy splits a string of natural language into a list of references to lexical types: +spaCy is an industrial-strength multi-language tokenizer, bristling with features +you never knew you wanted. You do want these features though --- your current +tokenizer has been doing it wrong. +Where other tokenizers give you a list of strings, spaCy gives you references +to rich lexical types, for easy, excellent and efficient feature extraction. + +* **Easy**: Tokenizer returns a sequence of rich lexical types, with features + pre-computed: >>> from spacy.en import EN - >>> tokens = EN.tokenize(u"Examples aren't easy, are they?") - >>> type(tokens[0]) - spacy.word.Lexeme - >>> tokens[1] is tokens[5] - True + >>> for w in EN.tokenize(string): + ... print w.sic, w.shape, w.cluster, w.oft_title, w.can_verb -Other tokenizers return lists of strings, which is -`downright barbaric `__. If you get a list of strings, -you have to write all the features yourself, and you'll probably compute them -on a per-token basis, instead of a per-type basis. At scale, that's very -inefficient. +Check out the tutorial and API docs. -spaCy's tokens come with the following orthographic and distributional features -pre-computed: +* **Excellent**: Distributional and orthographic features are crucial to robust + NLP. Without them, models can only learn from tiny annotated training + corpora. Read more. 
+ +* **Efficient**: spaCy serves you rich lexical objects faster than most + tokenizers can give you a list of strings. -* Orthographic flags, such as is_alpha, is_digit, is_punct, is_title etc; ++--------+-------+--------------+--------------+ +| System | Time | Words/second | Speed Factor | ++--------+-------+--------------+--------------+ +| NLTK | 6m4s | 89,000 | 1.00 | ++--------+-------+--------------+--------------+ +| spaCy | 9.5s | 3,093,000 | 38.30 | ++--------+-------+--------------+--------------+ -* Useful string transforms, such as canonical casing, word shape, ASCIIfied, - etc; -* Unigram log probability; - -* Brown cluster; - -* can_noun, can_verb etc tag-dictionary; - -* oft_upper, oft_title etc case-behaviour flags. - -The features are up-to-date with current NLP research, but you can replace or -augment them if you need to. .. toctree:: + :hidden: :maxdepth: 3 - guide/overview.rst - guide/install.rst - - api/index.rst - - modules/index.rst - -License -======= - -+------------------+------+ -| Non-commercial | $0 | -+------------------+------+ -| Trial commercial | $0 | -+------------------+------+ -| Full commercial | $500 | -+------------------+------+ - -spaCy is non-free software. Its source is published, but the copyright is -retained by the author (Matthew Honnibal). Licenses are currently under preparation. - -There is currently a gap between the output of academic NLP researchers, and -the needs of a small software companiess. I left academia to try to correct this. -My idea is that non-commercial and trial commercial use should "feel" just like -free software. But, if you do use the code in a commercial product, a small -fixed license-fee will apply, in order to fund development. 
- + what/index.rst + why/index.rst + how/index.rst diff --git a/docs/source/what/index.rst b/docs/source/what/index.rst new file mode 100644 index 000000000..8f263fe5f --- /dev/null +++ b/docs/source/what/index.rst @@ -0,0 +1,31 @@ +What +==== + +Overview +-------- + +Feature List +------------ + +License (for the code) +---------------------- + ++------------------+------+ +| Non-commercial | $0 | ++------------------+------+ +| Trial commercial | $0 | ++------------------+------+ +| Full commercial | $500 | ++------------------+------+ + +spaCy is non-free software. Its source is published, but the copyright is +retained by the author (Matthew Honnibal). Licenses are currently under preparation. + +There is currently a gap between the output of academic NLP researchers, and +the needs of small software companies. I left academia to try to correct this. +My idea is that non-commercial and trial commercial use should "feel" just like +free software. But, if you do use the code in a commercial product, a small +fixed license-fee will apply, in order to fund development. + +Pricing (for the data) +---------------------- diff --git a/docs/source/why/index.rst b/docs/source/why/index.rst new file mode 100644 index 000000000..8f1f78272 --- /dev/null +++ b/docs/source/why/index.rst @@ -0,0 +1,28 @@ +Why +=== + +Benchmarks +---------- + +Efficiency +---------- + ++--------+-------+--------------+--------------+ +| System | Time | Words/second | Speed Factor | ++--------+-------+--------------+--------------+ +| NLTK | 6m4s | 89,000 | 1.00 | ++--------+-------+--------------+--------------+ +| spaCy | 9.5s | 3,093,000 | 38.30 | ++--------+-------+--------------+--------------+ + + +Accuracy +-------- + +The comparison refers to 30 million words from the English Gigaword, on +a MacBook Air. For context, calling string.split() on the data completes in +about 5s. + +Pros and Cons +------------- +