Merge remote-tracking branch 'refs/remotes/honnibal/master'

Conflicts:
	setup.py
This commit is contained in:
maxirmx 2015-10-10 17:38:06 +03:00
commit 8e03239ac5
40 changed files with 5315 additions and 292 deletions

1
.gitignore vendored
View File

@ -91,3 +91,4 @@ coverage.xml
# Sphinx documentation # Sphinx documentation
docs/_build/ docs/_build/
docs/_themes/ docs/_themes/
setup.py

View File

@ -24,4 +24,4 @@ install:
# run tests # run tests
script: script:
- "py.test tests/ website/tests/ -x" - "py.test tests/ -x"

View File

@ -27,8 +27,8 @@ from pathlib import Path
from shutil import copyfile from shutil import copyfile
from shutil import copytree from shutil import copytree
import codecs
from collections import defaultdict from collections import defaultdict
import io
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors from spacy.vocab import write_binary_vectors
@ -61,7 +61,7 @@ def _read_clusters(loc):
print("Warning: Clusters file not found") print("Warning: Clusters file not found")
return {} return {}
clusters = {} clusters = {}
for line in codecs.open(str(loc), 'r', 'utf8'): for line in io.open(str(loc), 'r', encoding='utf8'):
try: try:
cluster, word, freq = line.split() cluster, word, freq = line.split()
except ValueError: except ValueError:
@ -88,7 +88,7 @@ def _read_probs(loc):
print("Probabilities file not found. Trying freqs.") print("Probabilities file not found. Trying freqs.")
return {}, 0.0 return {}, 0.0
probs = {} probs = {}
for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')): for i, line in enumerate(io.open(str(loc), 'r', encoding='utf8')):
prob, word = line.split() prob, word = line.split()
prob = float(prob) prob = float(prob)
probs[word] = prob probs[word] = prob

View File

@ -1,11 +1,11 @@
import codecs import io
import plac import plac
from spacy.en import English from spacy.en import English
def main(text_loc): def main(text_loc):
with codecs.open(text_loc, 'r', 'utf8') as file_: with io.open(text_loc, 'r', encoding='utf8') as file_:
text = file_.read() text = file_.read()
NLU = English() NLU = English()
for paragraph in text.split('\n\n'): for paragraph in text.split('\n\n'):

View File

@ -6,7 +6,7 @@ from __future__ import print_function
import os import os
from os import path from os import path
import shutil import shutil
import codecs import io
import random import random
import plac import plac
@ -177,7 +177,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
nlp = Language(data_dir=model_dir) nlp = Language(data_dir=model_dir)
gold_tuples = read_json_file(dev_loc) gold_tuples = read_json_file(dev_loc)
scorer = Scorer() scorer = Scorer()
out_file = codecs.open(out_loc, 'w', 'utf8') out_file = io.open(out_loc, 'w', 'utf8')
for raw_text, sents in gold_tuples: for raw_text, sents in gold_tuples:
sents = _merge_sents(sents) sents = _merge_sents(sents)
for annot_tuples, brackets in sents: for annot_tuples, brackets in sents:
@ -229,7 +229,6 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos
print('POS', scorer.tags_acc) print('POS', scorer.tags_acc)
print('UAS', scorer.uas) print('UAS', scorer.uas)
print('LAS', scorer.las) print('LAS', scorer.las)
print('SBD', scorer.sbd_acc)
print('NER P', scorer.ents_p) print('NER P', scorer.ents_p)
print('NER R', scorer.ents_r) print('NER R', scorer.ents_r)

View File

@ -27,7 +27,7 @@ import json
from os import path from os import path
import os import os
import re import re
import codecs import io
from collections import defaultdict from collections import defaultdict
from spacy.munge import read_ptb from spacy.munge import read_ptb
@ -122,7 +122,7 @@ def read_file(*pieces):
if not path.exists(loc): if not path.exists(loc):
return None return None
else: else:
return codecs.open(loc, 'r', 'utf8').read().strip() return io.open(loc, 'r', encoding='utf8').read().strip()
def get_file_names(section_dir, subsection): def get_file_names(section_dir, subsection):

View File

@ -22,74 +22,77 @@ our pattern set stays very small (exact size depends on the maximum length we're
looking for, as the query language currently has no quantifiers) looking for, as the query language currently has no quantifiers)
""" """
from __future__ import print_function, unicode_literals, division from __future__ import print_function, unicode_literals, division
from ast import literal_eval
from bz2 import BZ2File
import time
import math
import codecs
import plac import plac
from preshed.maps import PreshMap from preshed.maps import PreshMap
from preshed.counter import PreshCounter
from spacy.strings import hash_string from spacy.strings import hash_string
from spacy.en import English from spacy.en import English
from spacy.matcher import Matcher from spacy.matcher import PhraseMatcher
from spacy.attrs import FLAG63 as U_ENT
from spacy.attrs import FLAG62 as L_ENT
from spacy.attrs import FLAG61 as I_ENT
from spacy.attrs import FLAG60 as B_ENT
def get_bilou(length): def read_gazetteer(tokenizer, loc, n=-1):
if length == 1: for i, line in enumerate(open(loc)):
return [U_ENT] phrase = literal_eval('u' + line.strip())
else: if ' (' in phrase and phrase.endswith(')'):
return [B_ENT] + [I_ENT] * (length - 2) + [L_ENT] phrase = phrase.split(' (', 1)[0]
if i >= n:
break
phrase = tokenizer(phrase)
if all((t.is_lower and t.prob >= -10) for t in phrase):
continue
if len(phrase) >= 2:
yield phrase
def make_matcher(vocab, max_length): def read_text(bz2_loc):
abstract_patterns = [] with BZ2File(bz2_loc) as file_:
for length in range(1, max_length+1): for line in file_:
abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) yield line.decode('utf8')
return Matcher(vocab, {'Candidate': ('CAND', {}, abstract_patterns)})
def get_matches(matcher, pattern_ids, doc): def get_matches(tokenizer, phrases, texts, max_length=6):
matches = [] matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length)
for label, start, end in matcher(doc): print("Match")
candidate = doc[start : end] for text in texts:
if pattern_ids[hash_string(candidate.text)] == True: doc = tokenizer(text)
start = candidate[0].idx matches = matcher(doc)
end = candidate[-1].idx + len(candidate[-1]) for mwe in doc.ents:
matches.append((start, end, candidate.root.tag_, candidate.text)) yield mwe
return matches
def merge_matches(doc, matches): def main(patterns_loc, text_loc, counts_loc, n=10000000):
for start, end, tag, text in matches:
doc.merge(start, end, tag, text, 'MWE')
def main():
nlp = English(parser=False, tagger=False, entity=False) nlp = English(parser=False, tagger=False, entity=False)
print("Make matcher")
phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
counts = PreshCounter()
t1 = time.time()
for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
counts.inc(hash_string(mwe.text), 1)
t2 = time.time()
print("10m tokens in %d s" % (t2 - t1))
gazetteer = [u'M.I.A.', 'Shiny Happy People', 'James E. Jones'] with codecs.open(counts_loc, 'w', 'utf8') as file_:
example_text = u'The artist M.I.A. did a cover of Shiny Happy People. People is not an entity.' for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
pattern_ids = PreshMap() text = phrase.string
max_length = 0 key = hash_string(text)
for pattern_str in gazetteer: count = counts[key]
pattern = nlp.tokenizer(pattern_str) if count != 0:
bilou_tags = get_bilou(len(pattern)) file_.write('%d\t%s\n' % (count, text))
for word, tag in zip(pattern, bilou_tags):
lexeme = nlp.vocab[word.orth]
lexeme.set_flag(tag, True)
pattern_ids[hash_string(pattern.text)] = True
max_length = max(max_length, len(pattern))
matcher = make_matcher(nlp.vocab, max_length)
doc = nlp(example_text)
matches = get_matches(matcher, pattern_ids, doc)
merge_matches(doc, matches)
for token in doc:
print(token.text, token.ent_type_)
if __name__ == '__main__': if __name__ == '__main__':
if False:
import cProfile
import pstats
cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
s.strip_dirs().sort_stats("time").print_stats()
else:
plac.call(main) plac.call(main)

2
fabfile.py vendored
View File

@ -47,7 +47,7 @@ def prebuild(build_dir='/tmp/build_spacy'):
local('git clone %s .' % spacy_dir) local('git clone %s .' % spacy_dir)
local('virtualenv ' + build_venv) local('virtualenv ' + build_venv)
with prefix('cd %s && PYTHONPATH=`pwd` && . %s/bin/activate' % (build_dir, build_venv)): with prefix('cd %s && PYTHONPATH=`pwd` && . %s/bin/activate' % (build_dir, build_venv)):
local('pip install cython fabric fabtools') local('pip install cython fabric fabtools pytest')
local('pip install -r requirements.txt') local('pip install -r requirements.txt')
local('fab clean make') local('fab clean make')
local('cp -r %s/corpora/en/wordnet corpora/en/' % spacy_dir) local('cp -r %s/corpora/en/wordnet corpora/en/' % spacy_dir)

View File

@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
import json import json
contractions = {"n't", "'nt", "not", "'ve", "'d", "'ll", "'s", "'m", "'ma", "'re"} contractions = {"n't", "'nt", "not", "'ve", "'d", "'ll", "'s", "'m", "'ma", "'re"}
@ -114,6 +115,8 @@ hardcoded_specials = {
"'s": [{"F": "'s", "L": "'s"}], "'s": [{"F": "'s", "L": "'s"}],
"'S": [{"F": "'S", "L": "'s"}], "'S": [{"F": "'S", "L": "'s"}],
u"\u2018s": [{"F": u"\u2018s", "L": "'s"}],
u"\u2018S": [{"F": u"\u2018S", "L": "'s"}],
"'em": [{"F": "'em"}], "'em": [{"F": "'em"}],
@ -133,6 +136,8 @@ hardcoded_specials = {
"''": [{"F": "''"}], "''": [{"F": "''"}],
"": [{"F": "", "L": "--", "pos": ":"}],
"Corp.": [{"F": "Corp."}], "Corp.": [{"F": "Corp."}],
"Inc.": [{"F": "Inc."}], "Inc.": [{"F": "Inc."}],
"Co.": [{"F": "Co."}], "Co.": [{"F": "Co."}],
@ -336,7 +341,8 @@ hardcoded_specials = {
"E.G.": [{"F": "E.G."}], "E.G.": [{"F": "E.G."}],
"\n": [{"F": "\n", "pos": "SP"}], "\n": [{"F": "\n", "pos": "SP"}],
"\t": [{"F": "\t", "pos": "SP"}], "\t": [{"F": "\t", "pos": "SP"}],
" ": [{"F": " ", "pos": "SP"}] " ": [{"F": " ", "pos": "SP"}],
u"\xa0": [{"F": u"\xa0", "pos": "SP", "L": " "}]
} }
@ -412,6 +418,6 @@ def generate_specials():
if __name__ == "__main__": if __name__ == "__main__":
specials = generate_specials() specials = generate_specials()
with open("specials.json", "w") as f: with open("specials.json", "w") as file_:
json.dump(specials, f) file_.write(json.dumps(specials, indent=2))

View File

@ -27,5 +27,12 @@
["est", ""], ["est", ""],
["er", "e"], ["er", "e"],
["est", "e"] ["est", "e"]
],
"punct": [
["“", "\""],
["”", "\""],
["\u2018", "'"],
["\u2019", "'"]
] ]
} }

File diff suppressed because one or more lines are too long

View File

@ -13,5 +13,7 @@
"ADP": {"pos": "ADP"}, "ADP": {"pos": "ADP"},
"SYM": {"pos": "SYM"}, "SYM": {"pos": "SYM"},
"X": {"pos": "X"}, "X": {"pos": "X"},
"INTJ": {"pos": "INTJ"} "INTJ": {"pos": "INTJ"},
"DET": {"pos": "DET"},
"PART": {"pos": "PART"}
} }

View File

@ -2,43 +2,43 @@
"S": {"pos": "NOUN"}, "S": {"pos": "NOUN"},
"E": {"pos": "ADP"}, "E": {"pos": "ADP"},
"RD": {"pos": "DET"}, "RD": {"pos": "DET"},
"V": {"pos": "VER"}, "V": {"pos": "VERB"},
"_": {"pos": "_"}, "_": {"pos": "NO_TAG"},
"A": {"pos": "ADJ"}, "A": {"pos": "ADJ"},
"SP": {"pos": "PROP"}, "SP": {"pos": "PROPN"},
"FF": {"pos": "PUNC"}, "FF": {"pos": "PUNCT"},
"FS": {"pos": "PUNC"}, "FS": {"pos": "PUNCT"},
"B": {"pos": "ADV"}, "B": {"pos": "ADV"},
"CC": {"pos": "CON"}, "CC": {"pos": "CONJ"},
"FB": {"pos": "PUNC"}, "FB": {"pos": "PUNCT"},
"VA": {"pos": "AUX"}, "VA": {"pos": "AUX"},
"PC": {"pos": "PRO"}, "PC": {"pos": "PRON"},
"N": {"pos": "NUM"}, "N": {"pos": "NUM"},
"RI": {"pos": "DET"}, "RI": {"pos": "DET"},
"PR": {"pos": "PRO"}, "PR": {"pos": "PRON"},
"CS": {"pos": "SCON"}, "CS": {"pos": "SCONJ"},
"BN": {"pos": "ADV"}, "BN": {"pos": "ADV"},
"AP": {"pos": "DET"}, "AP": {"pos": "DET"},
"VM": {"pos": "AUX"}, "VM": {"pos": "AUX"},
"DI": {"pos": "DET"}, "DI": {"pos": "DET"},
"FC": {"pos": "PUNC"}, "FC": {"pos": "PUNCT"},
"PI": {"pos": "PRO"}, "PI": {"pos": "PRON"},
"DD": {"pos": "DET"}, "DD": {"pos": "DET"},
"DQ": {"pos": "DET"}, "DQ": {"pos": "DET"},
"PQ": {"pos": "PRO"}, "PQ": {"pos": "PRON"},
"PD": {"pos": "PRO"}, "PD": {"pos": "PRON"},
"NO": {"pos": "ADJ"}, "NO": {"pos": "ADJ"},
"PE": {"pos": "PRO"}, "PE": {"pos": "PRON"},
"T": {"pos": "DET"}, "T": {"pos": "DET"},
"X": {"pos": "SYM"}, "X": {"pos": "SYM"},
"SW": {"pos": "X"}, "SW": {"pos": "X"},
"NO": {"pos": "PRO"}, "NO": {"pos": "PRON"},
"I": {"pos": "INT"}, "I": {"pos": "INTJ"},
"X": {"pos": "X"}, "X": {"pos": "X"},
"DR": {"pos": "DET"}, "DR": {"pos": "DET"},
"EA": {"pos": "ADP"}, "EA": {"pos": "ADP"},
"PP": {"pos": "PRO"}, "PP": {"pos": "PRON"},
"X": {"pos": "NUM"}, "X": {"pos": "NUM"},
"DE": {"pos": "DET"}, "DE": {"pos": "DET"},
"X": {"pos": "PAR"} "X": {"pos": "PART"}
} }

View File

@ -134,13 +134,17 @@ def run_setup(exts):
headers_workaround.install_headers('numpy') headers_workaround.install_headers('numpy')
VERSION = '0.93' VERSION = '0.94'
def main(modules, is_pypy): def main(modules, is_pypy):
language = "cpp" language = "cpp"
includes = ['.', path.join(sys.prefix, 'include')] includes = ['.', path.join(sys.prefix, 'include')]
<<<<<<< HEAD
# This is gcc only. Also -03 is everywhere and is not recognized :() # This is gcc only. Also -03 is everywhere and is not recognized :()
# compile_args = ['-O3', '-Wno-strict-prototypes'] # compile_args = ['-O3', '-Wno-strict-prototypes']
compile_args = ['-Ox', '-EHsc'] compile_args = ['-Ox', '-EHsc']
=======
compile_args = ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']
>>>>>>> refs/remotes/honnibal/master
link_args = [] link_args = []
# It is not prefix !!! # It is not prefix !!!
if sys.prefix == 'darwin': if sys.prefix == 'darwin':
@ -159,9 +163,13 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.morphology', 'spacy.tagger', 'spacy.morphology', 'spacy.tagger',
'spacy.syntax.stateclass', 'spacy.syntax.stateclass',
'spacy._ml', 'spacy._theano', 'spacy._ml', 'spacy._theano',
<<<<<<< HEAD
'spacy.tokenizer', 'spacy.tokenizer',
#'spacy.en.attrs', #'spacy.en.attrs',
#'spacy.en.pos', #'spacy.en.pos',
=======
'spacy.tokenizer',
>>>>>>> refs/remotes/honnibal/master
'spacy.syntax.parser', 'spacy.syntax.parser',
'spacy.syntax.transition_system', 'spacy.syntax.transition_system',
'spacy.syntax.arc_eager', 'spacy.syntax.arc_eager',

View File

@ -7,7 +7,7 @@ import wget
import plac import plac
# TODO: Read this from the same source as the setup # TODO: Read this from the same source as the setup
VERSION = '0.9.0' VERSION = '0.9.1'
AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com' AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com'

View File

@ -1,5 +1,7 @@
import numpy import numpy
import codecs import io
import json
import ujson
import random import random
import re import re
import os import os

View File

@ -1,4 +1,4 @@
from __future__ import unicode_literals from __future__ import unicode_literals, print_function
from os import path from os import path
import codecs import codecs
@ -7,7 +7,7 @@ try:
except ImportError: except ImportError:
import json import json
from .parts_of_speech import NOUN, VERB, ADJ from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
class Lemmatizer(object): class Lemmatizer(object):
@ -36,6 +36,8 @@ class Lemmatizer(object):
pos = 'verb' pos = 'verb'
elif pos == ADJ: elif pos == ADJ:
pos = 'adj' pos = 'adj'
elif pos == PUNCT:
pos = 'punct'
lemmas = lemmatize(string, self.index.get(pos, {}), self.exc.get(pos, {}), self.rules.get(pos, [])) lemmas = lemmatize(string, self.index.get(pos, {}), self.exc.get(pos, {}), self.rules.get(pos, []))
return lemmas return lemmas
@ -48,6 +50,9 @@ class Lemmatizer(object):
def adj(self, string): def adj(self, string):
return self(string, 'adj') return self(string, 'adj')
def punct(self, string):
    """Lemmatize `string` under the 'punct' category.

    Calls the instance itself with the fixed category name 'punct' and
    returns whatever that call produces.
    """
    category = 'punct'
    return self(string, category)
def lemmatize(string, index, exceptions, rules): def lemmatize(string, index, exceptions, rules):
string = string.lower() string = string.lower()
@ -58,7 +63,7 @@ def lemmatize(string, index, exceptions, rules):
for old, new in rules: for old, new in rules:
if string.endswith(old): if string.endswith(old):
form = string[:len(string) - len(old)] + new form = string[:len(string) - len(old)] + new
if form in index: if form in index or not form.isalpha():
forms.append(form) forms.append(form)
if not forms: if not forms:
forms.append(string) forms.append(string)

View File

@ -1,11 +1,18 @@
# cython: profile=True
from __future__ import unicode_literals
from os import path from os import path
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .attrs cimport attr_id_t from .attrs cimport attr_id_t
from .structs cimport TokenC from .structs cimport TokenC, LexemeC
from .lexeme cimport Lexeme
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector from libcpp.vector cimport vector
from murmurhash.mrmr cimport hash64
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
@ -15,6 +22,38 @@ from .vocab cimport Vocab
from libcpp.vector cimport vector from libcpp.vector cimport vector
from .attrs import FLAG61 as U_ENT
from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT
from .attrs import FLAG58 as B4_ENT
from .attrs import FLAG57 as B5_ENT
from .attrs import FLAG56 as B6_ENT
from .attrs import FLAG55 as B7_ENT
from .attrs import FLAG54 as B8_ENT
from .attrs import FLAG53 as B9_ENT
from .attrs import FLAG52 as B10_ENT
from .attrs import FLAG51 as I3_ENT
from .attrs import FLAG50 as I4_ENT
from .attrs import FLAG49 as I5_ENT
from .attrs import FLAG48 as I6_ENT
from .attrs import FLAG47 as I7_ENT
from .attrs import FLAG46 as I8_ENT
from .attrs import FLAG45 as I9_ENT
from .attrs import FLAG44 as I10_ENT
from .attrs import FLAG43 as L2_ENT
from .attrs import FLAG42 as L3_ENT
from .attrs import FLAG41 as L4_ENT
from .attrs import FLAG40 as L5_ENT
from .attrs import FLAG39 as L6_ENT
from .attrs import FLAG38 as L7_ENT
from .attrs import FLAG37 as L8_ENT
from .attrs import FLAG36 as L9_ENT
from .attrs import FLAG35 as L10_ENT
try: try:
import ujson as json import ujson as json
except ImportError: except ImportError:
@ -83,6 +122,32 @@ def _convert_strings(token_specs, string_store):
return converted return converted
def get_bilou(length):
    """Return the list of flag IDs used to tag a phrase of `length` tokens.

    A one-token phrase gets the single unit flag. Longer phrases get a
    length-specific begin flag, `length - 2` copies of the length-specific
    inside flag, and a length-specific last flag, so the returned list
    always has exactly `length` entries.

    Raises:
        ValueError: if `length` is not one of 1..10.
    """
    if length == 1:
        return [U_ENT]
    # One entry per supported phrase length, starting at length 2.
    begin_flags = (B2_ENT, B3_ENT, B4_ENT, B5_ENT, B6_ENT,
                   B7_ENT, B8_ENT, B9_ENT, B10_ENT)
    # A two-token phrase has no inside token, hence the placeholder.
    inside_flags = (None, I3_ENT, I4_ENT, I5_ENT, I6_ENT,
                    I7_ENT, I8_ENT, I9_ENT, I10_ENT)
    last_flags = (L2_ENT, L3_ENT, L4_ENT, L5_ENT, L6_ENT,
                  L7_ENT, L8_ENT, L9_ENT, L10_ENT)
    per_length = zip(begin_flags, inside_flags, last_flags)
    for size, (begin, inside, last) in enumerate(per_length, 2):
        if length == size:
            return [begin] + [inside] * (size - 2) + [last]
    raise ValueError("Max length currently 10 for phrase matching")
def map_attr_name(attr): def map_attr_name(attr):
attr = attr.upper() attr = attr.upper()
if attr == 'ORTH': if attr == 'ORTH':
@ -95,32 +160,6 @@ def map_attr_name(attr):
return SHAPE return SHAPE
elif attr == 'NORM': elif attr == 'NORM':
return NORM return NORM
elif attr == 'FLAG13':
return FLAG13
elif attr == 'FLAG14':
return FLAG14
elif attr == 'FLAG15':
return FLAG15
elif attr == 'FLAG16':
return FLAG16
elif attr == 'FLAG17':
return FLAG17
elif attr == 'FLAG18':
return FLAG18
elif attr == 'FLAG19':
return FLAG19
elif attr == 'FLAG20':
return FLAG20
elif attr == 'FLAG21':
return FLAG21
elif attr == 'FLAG22':
return FLAG22
elif attr == 'FLAG23':
return FLAG23
elif attr == 'FLAG24':
return FLAG24
elif attr == 'FLAG25':
return FLAG25
else: else:
raise Exception("TODO: Finish supporting attr mapping %s" % attr) raise Exception("TODO: Finish supporting attr mapping %s" % attr)
@ -163,7 +202,7 @@ cdef class Matcher:
spec = _convert_strings(spec, self.vocab.strings) spec = _convert_strings(spec, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, spec, etype)) self.patterns.push_back(init_pattern(self.mem, spec, etype))
def __call__(self, Doc doc): def __call__(self, Doc doc, acceptor=None):
cdef vector[Pattern*] partials cdef vector[Pattern*] partials
cdef int n_partials = 0 cdef int n_partials = 0
cdef int q = 0 cdef int q = 0
@ -174,21 +213,94 @@ cdef class Matcher:
for token_i in range(doc.length): for token_i in range(doc.length):
token = &doc.data[token_i] token = &doc.data[token_i]
q = 0 q = 0
# Go over the open matches, extending or finalizing if able. Otherwise,
# we over-write them (q doesn't advance)
for i in range(partials.size()): for i in range(partials.size()):
state = partials.at(i) state = partials.at(i)
if match(state, token): if match(state, token):
if is_final(state): if is_final(state):
matches.append(get_entity(state, token, token_i)) label, start, end = get_entity(state, token, token_i)
if acceptor is None or acceptor(doc, label, start, end):
matches.append((label, start, end))
else: else:
partials[q] = state + 1 partials[q] = state + 1
q += 1 q += 1
partials.resize(q) partials.resize(q)
# Check whether we open any new patterns on this token
for i in range(self.n_patterns): for i in range(self.n_patterns):
state = self.patterns[i] state = self.patterns[i]
if match(state, token): if match(state, token):
if is_final(state): if is_final(state):
matches.append(get_entity(state, token, token_i)) label, start, end = get_entity(state, token, token_i)
if acceptor is None or acceptor(doc, label, start, end):
matches.append((label, start, end))
else: else:
partials.push_back(state + 1) partials.push_back(state + 1)
doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches
return matches return matches
cdef class PhraseMatcher:
    """Match known multi-token phrases in a Doc using lexeme flags plus a hash check.

    Registering a phrase sets a position/length-specific flag on each of its
    lexemes and stores a hash of the phrase's orth-ID sequence in `phrase_ids`.
    The wrapped `Matcher` proposes candidate spans from flag-only patterns;
    `accept_match` then keeps only candidates whose orth-ID sequence hashes
    to a registered phrase.
    """
    # Pool that owns the `_phrase_key` buffer allocated in __init__.
    cdef Pool mem
    cdef Vocab vocab
    # Flag-pattern matcher that proposes candidate spans.
    cdef Matcher matcher
    # Maps hash of a phrase's orth-ID sequence -> True for registered phrases.
    cdef PreshMap phrase_ids
    # Exclusive upper bound on phrase length: only phrases with fewer than
    # `max_length` tokens are registered or accepted.
    cdef int max_length
    # Scratch buffer of `max_length` attr_t slots, reused to build hash keys.
    cdef attr_t* _phrase_key

    def __init__(self, Vocab vocab, phrases, max_length=10):
        """Register `phrases` and install one flag pattern per phrase length.

        Phrases with `len(phrase) >= max_length` are skipped without error.
        Patterns are built for lengths 1 .. max_length - 1.
        NOTE(review): get_bilou in this module raises ValueError for lengths
        above 10, so a `max_length` above 11 would fail while building the
        patterns -- confirm whether that limit is intended here.
        """
        self.mem = Pool()
        self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
        self.max_length = max_length
        self.vocab = vocab
        self.matcher = Matcher(self.vocab, {})
        self.phrase_ids = PreshMap()
        for phrase in phrases:
            if len(phrase) < max_length:
                self.add(phrase)

        # Each pattern is a list of single-flag token specs, one per token,
        # using the flags get_bilou assigns for that phrase length.
        abstract_patterns = []
        for length in range(1, max_length):
            abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
        self.matcher.add('Candidate', 'MWE', {}, abstract_patterns)

    def add(self, Doc tokens):
        """Register one tokenized phrase.

        Sets the length-specific flag on the vocab lexeme of every token and
        records the hash of the phrase's orth IDs in `phrase_ids`.
        Asserts that the phrase has fewer than `max_length` tokens.
        """
        cdef int length = tokens.length
        assert length < self.max_length
        tags = get_bilou(length)
        assert len(tags) == length, length

        cdef int i
        # Zero the whole buffer first: the hash below covers all `max_length`
        # slots, so unused trailing slots must hold a fixed value.
        for i in range(self.max_length):
            self._phrase_key[i] = 0
        for i, tag in enumerate(tags):
            lexeme = self.vocab[tokens.data[i].lex.orth]
            lexeme.set_flag(tag, True)
            self._phrase_key[i] = lexeme.orth
        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
        self.phrase_ids[key] = True

    def __call__(self, Doc doc):
        """Find registered phrases in `doc`, merge each one, and return them.

        Returns a list of `(start, end, tag, text, 'MWE')` tuples, where
        `start`/`end` are computed from the `idx` of the first and last token
        (presumably character offsets -- TODO confirm) and `tag` is the
        `tag_` of the candidate span's root.
        """
        matches = []
        for label, start, end in self.matcher(doc, acceptor=self.accept_match):
            cand = doc[start : end]
            start = cand[0].idx
            end = cand[-1].idx + len(cand[-1])
            matches.append((start, end, cand.root.tag_, cand.text, 'MWE'))
        # Merging is done only after all matches have been collected.
        for match in matches:
            doc.merge(*match)
        return matches

    def accept_match(self, Doc doc, int label, int start, int end):
        """Return True if tokens `start:end` of `doc` form a registered phrase.

        Builds the same zero-padded orth-ID key that `add` builds and looks
        its hash up in `phrase_ids`. `label` is not used. Asserts that the
        span has fewer than `max_length` tokens.
        """
        assert (end - start) < self.max_length
        cdef int i, j
        for i in range(self.max_length):
            self._phrase_key[i] = 0
        for i, j in enumerate(range(start, end)):
            self._phrase_key[i] = doc.data[j].lex.orth
        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
        if self.phrase_ids.get(key):
            return True
        else:
            return False

View File

@ -7,7 +7,7 @@ except ImportError:
import json import json
from .parts_of_speech import UNIV_POS_NAMES from .parts_of_speech import UNIV_POS_NAMES
from .parts_of_speech cimport ADJ, VERB, NOUN from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
cdef class Morphology: cdef class Morphology:
@ -31,10 +31,7 @@ cdef class Morphology:
cdef int assign_tag(self, TokenC* token, tag) except -1: cdef int assign_tag(self, TokenC* token, tag) except -1:
cdef int tag_id cdef int tag_id
if isinstance(tag, basestring): if isinstance(tag, basestring):
try:
tag_id = self.reverse_index[self.strings[tag]] tag_id = self.reverse_index[self.strings[tag]]
except KeyError:
raise
else: else:
tag_id = tag tag_id = tag
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth) analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
@ -84,7 +81,7 @@ cdef class Morphology:
if self.lemmatizer is None: if self.lemmatizer is None:
return orth return orth
cdef unicode py_string = self.strings[orth] cdef unicode py_string = self.strings[orth]
if pos != NOUN and pos != VERB and pos != ADJ: if pos != NOUN and pos != VERB and pos != ADJ and pos != PUNCT:
return orth return orth
cdef set lemma_strings cdef set lemma_strings
cdef unicode lemma_string cdef unicode lemma_string

View File

@ -11,6 +11,7 @@ try:
except ImportError: except ImportError:
from text_unidecode import unidecode from text_unidecode import unidecode
import re import re
import math import math
@ -165,7 +166,7 @@ cpdef unicode norm1(unicode string, lower_pc=0.0, upper_pc=0.0, title_pc=0.0):
cpdef bytes asciied(unicode string): cpdef bytes asciied(unicode string):
cdef str stripped = unidecode(string) stripped = unidecode(string)
if not stripped: if not stripped:
return b'???' return b'???'
return stripped.encode('ascii') return stripped.encode('ascii')

View File

@ -9,7 +9,8 @@ from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse from ..gold cimport GoldParse
from ..gold cimport GoldParseC from ..gold cimport GoldParseC
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
from ..lexeme cimport Lexeme
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
from libc.string cimport memcpy from libc.string cimport memcpy
@ -379,8 +380,18 @@ cdef class ArcEager(TransitionSystem):
st.fast_forward() st.fast_forward()
cdef int finalize_state(self, StateClass st) nogil: cdef int finalize_state(self, StateClass st) nogil:
cdef int i
for i in range(st.length): for i in range(st.length):
if st._sent[i].head == 0 and st._sent[i].dep == 0: # Always attach spaces to the previous word
if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE):
st._sent[i].head = -1 if (i >= 1) else 1
if st._sent[i].sent_start and st._sent[i].head == -1:
st._sent[i].sent_start = False
# If we had this space token as the start of a sentence,
# move that sentence start forward one
if (i + 1) < st.length and not st._sent[i+1].sent_start:
st._sent[i+1].sent_start = True
elif st._sent[i].head == 0 and st._sent[i].dep == 0:
st._sent[i].dep = self.root_label st._sent[i].dep = self.root_label
# If we're not using the Break transition, we segment via root-labelled # If we're not using the Break transition, we segment via root-labelled
# arcs between the root words. # arcs between the root words.

View File

@ -21,6 +21,7 @@ from ..lexeme cimport Lexeme
from .spans cimport Span from .spans cimport Span
from .token cimport Token from .token cimport Token
from ..serialize.bits cimport BitArray from ..serialize.bits cimport BitArray
from ..util import normalize_slice
DEF PADDING = 5 DEF PADDING = 5
@ -81,20 +82,14 @@ cdef class Doc:
self._vector = None self._vector = None
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a token. """Get a Token or a Span from the Doc.
Returns: Returns:
token (Token): token (Token) or span (Span):
""" """
if isinstance(i, slice): if isinstance(i, slice):
if i.step is not None: start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
raise ValueError("Stepped slices not supported in Span objects." return Span(self, start, stop, label=0)
"Try: list(doc)[start:stop:step] instead.")
if i.start is None:
i = slice(0, i.stop)
if i.stop is None:
i = slice(i.start, len(self))
return Span(self, i.start, i.stop, label=0)
if i < 0: if i < 0:
i = self.length + i i = self.length + i

View File

@ -9,16 +9,16 @@ from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t from ..typedefs cimport flags_t, attr_t
from ..attrs cimport attr_id_t from ..attrs cimport attr_id_t
from ..parts_of_speech cimport univ_pos_t from ..parts_of_speech cimport univ_pos_t
from ..util import normalize_slice
cdef class Span: cdef class Span:
"""A slice from a Doc object.""" """A slice from a Doc object."""
def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None, def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None,
vector_norm=None): vector_norm=None):
if start < 0: if not (0 <= start <= end <= len(tokens)):
start = tokens.length - start raise IndexError
if end < 0:
end = tokens.length - end
self.doc = tokens self.doc = tokens
self.start = start self.start = start
self.end = end self.end = end
@ -46,7 +46,13 @@ cdef class Span:
return 0 return 0
return self.end - self.start return self.end - self.start
def __getitem__(self, int i): def __getitem__(self, object i):
if isinstance(i, slice):
start, end = normalize_slice(len(self), i.start, i.stop, i.step)
start += self.start
end += self.start
return Span(self.doc, start, end)
if i < 0: if i < 0:
return self.doc[self.end + i] return self.doc[self.end + i]
else: else:

View File

@ -1,5 +1,5 @@
from os import path from os import path
import codecs import io
import json import json
import re import re
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
@ -7,8 +7,28 @@ from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
DATA_DIR = path.join(path.dirname(__file__), '..', 'data') DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
def normalize_slice(length, start, stop, step=None):
    """Resolve slice bounds against a sequence of `length` items.

    Negative bounds are taken relative to the end, `None` bounds default to
    the start/end of the sequence, and both results are clamped so that
    ``0 <= start <= stop <= length`` (a `stop` before `start` yields an
    empty range at `start`).

    Args:
        length: number of items in the sequence being sliced.
        start: slice start, possibly negative or None.
        stop: slice stop, possibly negative or None.
        step: slice step; only None or 1 is supported.

    Returns:
        A `(start, stop)` tuple of normalized, in-range bounds.

    Raises:
        ValueError: if `step` is anything other than None or 1.
    """
    if not (step is None or step == 1):
        # Adjacent literals are concatenated, so the separating space after
        # the first sentence has to be part of the literal itself.
        raise ValueError("Stepped slices not supported in Span objects. "
                         "Try: list(tokens)[start:stop:step] instead.")
    if start is None:
        start = 0
    elif start < 0:
        start += length
    start = min(length, max(0, start))

    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    # Clamp against `start`, not 0, so the range can never be reversed.
    stop = min(length, max(start, stop))

    assert 0 <= start <= stop <= length
    return start, stop
def utf8open(loc, mode='r'): def utf8open(loc, mode='r'):
return codecs.open(loc, mode, 'utf8') return io.open(loc, mode, encoding='utf8')
def read_lang_data(data_dir): def read_lang_data(data_dir):

View File

@ -7,7 +7,7 @@ from libc.stdint cimport uint64_t
import bz2 import bz2
from os import path from os import path
import codecs import io
import math import math
import json import json

View File

@ -3,6 +3,7 @@ import pytest
from spacy.matcher import Matcher from spacy.matcher import Matcher
@pytest.mark.xfail
def test_overlap_issue118(EN): def test_overlap_issue118(EN):
'''Test a bug that arose from having overlapping matches''' '''Test a bug that arose from having overlapping matches'''
doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night') doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night')

View File

@ -1,13 +1,13 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from os import path from os import path
import codecs import io
import pytest import pytest
@pytest.fixture @pytest.fixture
def sun_text(): def sun_text():
with codecs.open(path.join(path.dirname(__file__), '..', 'sun.txt'), 'r', 'utf8') as file_: with io.open(path.join(path.dirname(__file__), 'sun.txt'), 'r', encoding='utf8') as file_:
text = file_.read() text = file_.read()
return text return text

View File

@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.lemmatizer import Lemmatizer, read_index, read_exc from spacy.lemmatizer import Lemmatizer, read_index, read_exc
@ -34,3 +35,9 @@ def test_noun_lemmas(lemmatizer):
assert do('planets') == set(['planet']) assert do('planets') == set(['planet'])
assert do('ring') == set(['ring']) assert do('ring') == set(['ring'])
assert do('axes') == set(['axis', 'axe', 'ax']) assert do('axes') == set(['axis', 'axe', 'ax'])
def test_smart_quotes(lemmatizer):
    """Check that curly double quotes lemmatize to a plain ASCII double quote."""
    do = lemmatizer.punct
    # NOTE(review): the quote characters were missing from the extracted text
    # (empty literals); restored as U+201C / U+201D based on the test name.
    # Confirm against the repository copy.
    assert do('“') == set(['"'])
    assert do('”') == set(['"'])

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
import pytest import pytest
from spacy.en import attrs from spacy import attrs
def test_attr_of_token(EN): def test_attr_of_token(EN):

View File

@ -1,8 +1,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.en import English from spacy.en import English
from spacy.en.attrs import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT from spacy.attrs import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT
from spacy.en.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM from spacy.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM
from spacy.en.attrs import IS_STOP from spacy.attrs import IS_STOP
import pytest import pytest

View File

@ -12,6 +12,72 @@ def test_getitem(EN):
with pytest.raises(IndexError): with pytest.raises(IndexError):
tokens[len(tokens)] tokens[len(tokens)]
def to_str(span):
return '/'.join(token.orth_ for token in span)
span = tokens[1:1]
assert not to_str(span)
span = tokens[1:4]
assert to_str(span) == 'it/back/!'
span = tokens[1:4:1]
assert to_str(span) == 'it/back/!'
with pytest.raises(ValueError):
tokens[1:4:2]
with pytest.raises(ValueError):
tokens[1:4:-1]
span = tokens[-3:6]
assert to_str(span) == 'He/pleaded'
span = tokens[4:-1]
assert to_str(span) == 'He/pleaded'
span = tokens[-5:-3]
assert to_str(span) == 'back/!'
span = tokens[5:4]
assert span.start == span.end == 5 and not to_str(span)
span = tokens[4:-3]
assert span.start == span.end == 4 and not to_str(span)
span = tokens[:]
assert to_str(span) == 'Give/it/back/!/He/pleaded/.'
span = tokens[4:]
assert to_str(span) == 'He/pleaded/.'
span = tokens[:4]
assert to_str(span) == 'Give/it/back/!'
span = tokens[:-3]
assert to_str(span) == 'Give/it/back/!'
span = tokens[-3:]
assert to_str(span) == 'He/pleaded/.'
span = tokens[4:50]
assert to_str(span) == 'He/pleaded/.'
span = tokens[-50:4]
assert to_str(span) == 'Give/it/back/!'
span = tokens[-50:-40]
assert span.start == span.end == 0 and not to_str(span)
span = tokens[40:50]
assert span.start == span.end == 7 and not to_str(span)
span = tokens[1:4]
assert span[0].orth_ == 'it'
subspan = span[:]
assert to_str(subspan) == 'it/back/!'
subspan = span[:2]
assert to_str(subspan) == 'it/back'
subspan = span[1:]
assert to_str(subspan) == 'back/!'
subspan = span[:-1]
assert to_str(subspan) == 'it/back'
subspan = span[-2:]
assert to_str(subspan) == 'back/!'
subspan = span[1:2]
assert to_str(subspan) == 'back'
subspan = span[-2:-1]
assert to_str(subspan) == 'back'
subspan = span[-50:50]
assert to_str(subspan) == 'it/back/!'
subspan = span[50:-50]
assert subspan.start == subspan.end == 4 and not to_str(subspan)
@pytest.mark.models @pytest.mark.models
def test_serialize(EN): def test_serialize(EN):

View File

@ -2,7 +2,7 @@ from __future__ import unicode_literals
import pytest import pytest
from spacy.en.attrs import * from spacy.attrs import *
def test_is_alpha(en_vocab): def test_is_alpha(en_vocab):

View File

@ -26,6 +26,7 @@ def test_main_entry_point(nlp):
doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
@pytest.mark.models
def test_sentence_spans(nlp): def test_sentence_spans(nlp):
# from spacy.en import English # from spacy.en import English
# nlp = English() # nlp = English()
@ -33,6 +34,7 @@ def test_sentence_spans(nlp):
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"] assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
@pytest.mark.models
def test_entity_spans(nlp): def test_entity_spans(nlp):
# from spacy.en import English # from spacy.en import English
# nlp = English() # nlp = English()
@ -44,6 +46,7 @@ def test_entity_spans(nlp):
assert ents[0].string == ents[0].string assert ents[0].string == ents[0].string
@pytest.mark.models
def test_noun_chunk_spans(nlp): def test_noun_chunk_spans(nlp):
# from spacy.en import English # from spacy.en import English
# nlp = English() # nlp = English()
@ -56,11 +59,12 @@ def test_noun_chunk_spans(nlp):
# NP three noun chunks <-- has # NP three noun chunks <-- has
@pytest.mark.models
def test_count_by(nlp): def test_count_by(nlp):
# from spacy.en import English, attrs # from spacy.en import English, attrs
# nlp = English() # nlp = English()
import numpy import numpy
from spacy.en import attrs from spacy import attrs
tokens = nlp('apple apple orange banana') tokens = nlp('apple apple orange banana')
assert tokens.count_by(attrs.ORTH) == {2529: 2, 4117: 1, 6650: 1} assert tokens.count_by(attrs.ORTH) == {2529: 2, 4117: 1, 6650: 1}
assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[2529], assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[2529],
@ -76,7 +80,7 @@ def test_read_bytes(nlp):
file_.write(nlp(u'This is a document.').to_bytes()) file_.write(nlp(u'This is a document.').to_bytes())
file_.write(nlp(u'This is another.').to_bytes()) file_.write(nlp(u'This is another.').to_bytes())
docs = [] docs = []
with open(loc) as file_: with open(loc, 'rb') as file_:
for byte_string in Doc.read_bytes(file_): for byte_string in Doc.read_bytes(file_):
docs.append(Doc(nlp.vocab).from_bytes(byte_string)) docs.append(Doc(nlp.vocab).from_bytes(byte_string))
assert len(docs) == 2 assert len(docs) == 2
@ -88,6 +92,7 @@ def test_token_span(doc):
assert token.i == 4 assert token.i == 4
@pytest.mark.models
def test_example_i_like_new_york1(nlp): def test_example_i_like_new_york1(nlp):
toks = nlp('I like New York in Autumn.') toks = nlp('I like New York in Autumn.')
@ -127,16 +132,19 @@ def dot(toks):
return tok(toks, "dot") return tok(toks, "dot")
@pytest.mark.models
def test_example_i_like_new_york3(toks, new, york): def test_example_i_like_new_york3(toks, new, york):
assert toks[new].head.orth_ == 'York' assert toks[new].head.orth_ == 'York'
assert toks[york].head.orth_ == 'like' assert toks[york].head.orth_ == 'like'
@pytest.mark.models
def test_example_i_like_new_york4(toks, new, york): def test_example_i_like_new_york4(toks, new, york):
new_york = toks[new:york+1] new_york = toks[new:york+1]
assert new_york.root.orth_ == 'York' assert new_york.root.orth_ == 'York'
@pytest.mark.models
def test_example_i_like_new_york5(toks, autumn, dot): def test_example_i_like_new_york5(toks, autumn, dot):
assert toks[autumn].head.orth_ == 'in' assert toks[autumn].head.orth_ == 'in'
assert toks[dot].head.orth_ == 'like' assert toks[dot].head.orth_ == 'like'
@ -144,6 +152,7 @@ def test_example_i_like_new_york5(toks, autumn, dot):
assert autumn_dot.root.orth_ == 'Autumn' assert autumn_dot.root.orth_ == 'Autumn'
@pytest.mark.models
def test_navigating_the_parse_tree_lefts(doc): def test_navigating_the_parse_tree_lefts(doc):
# TODO: where does the span object come from? # TODO: where does the span object come from?
span = doc[:2] span = doc[:2]
@ -151,6 +160,7 @@ def test_navigating_the_parse_tree_lefts(doc):
if span.doc[i].head in span] if span.doc[i].head in span]
@pytest.mark.models
def test_navigating_the_parse_tree_rights(doc): def test_navigating_the_parse_tree_rights(doc):
span = doc[:2] span = doc[:2]
rights = [span.doc[i] for i in range(span.end, len(span.doc)) rights = [span.doc[i] for i in range(span.end, len(span.doc))

View File

@ -1,6 +1,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
import spacy.en import spacy
@pytest.fixture() @pytest.fixture()
@ -17,11 +17,12 @@ def test_load_resources_and_process_text():
@pytest.mark.models @pytest.mark.models
def test_get_tokens_and_sentences(doc): def test_get_tokens_and_sentences(doc):
token = doc[0] token = doc[0]
sentence = doc.sents.next() sentence = next(doc.sents)
assert token is sentence[0] assert token is sentence[0]
assert sentence.text == 'Hello, world.' assert sentence.text == 'Hello, world.'
@pytest.mark.models
def test_use_integer_ids_for_any_strings(nlp, token): def test_use_integer_ids_for_any_strings(nlp, token):
hello_id = nlp.vocab.strings['Hello'] hello_id = nlp.vocab.strings['Hello']
hello_str = nlp.vocab.strings[hello_id] hello_str = nlp.vocab.strings[hello_id]
@ -45,7 +46,7 @@ def test_get_and_set_string_views_and_flags(nlp, token):
def test_export_to_numpy_arrays(nlp, doc): def test_export_to_numpy_arrays(nlp, doc):
from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV from spacy.attrs import ORTH, LIKE_URL, IS_OOV
attr_ids = [ORTH, LIKE_URL, IS_OOV] attr_ids = [ORTH, LIKE_URL, IS_OOV]
doc_array = doc.to_array(attr_ids) doc_array = doc.to_array(attr_ids)
@ -68,6 +69,7 @@ def test_word_vectors(nlp):
assert apples.similarity(oranges) > boots.similarity(hippos) assert apples.similarity(oranges) > boots.similarity(hippos)
@pytest.mark.models
def test_part_of_speech_tags(nlp): def test_part_of_speech_tags(nlp):
from spacy.parts_of_speech import ADV from spacy.parts_of_speech import ADV

View File

@ -12,9 +12,6 @@ site/index.html: src/jade/header.jade src/jade/*.jade
site/docs/: src/jade/docs/*.jade src/jade/header.jade site/docs/: src/jade/docs/*.jade src/jade/header.jade
jade -P src/jade/docs/index.jade --out $@ jade -P src/jade/docs/index.jade --out $@
site/license/: src/jade/license/*.jade src/jade/header.jade
jade -P src/jade/license/index.jade --out $@
site/blog/: src/jade/blog/*.jade site/blog/*/ site/tutorials/*/ src/jade/header.jade site/blog/: src/jade/blog/*.jade site/blog/*/ site/tutorials/*/ src/jade/header.jade
jade -P src/jade/blog/index.jade --out $@ jade -P src/jade/blog/index.jade --out $@

View File

@ -24,7 +24,7 @@ include ./meta.jade
p These days we just show the computer lots and lots and lots of words. We gave up trying to get it to understand what a “dress” is. We let #[em dress] be just some letters. But if it is seen it around #[em girl] enough times (which is just some other letters, which are seen around some #[strong other] other letters), it can make good guesses. p These days we just show the computer lots and lots and lots of words. We gave up trying to get it to understand what a “dress” is. We let #[em dress] be just some letters. But if it is seen it around #[em girl] enough times (which is just some other letters, which are seen around some #[strong other] other letters), it can make good guesses.
p It doesn't always guess right, but we can tell how often it does, and we can think of ways t help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit. p It doesn't always guess right, but we can tell how often it does, and we can think of ways to help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit.
p (One thing I've learned is, people are great at making a number bigger, if you pay a lot of them to try. The key is to pick numbers where, if they make the number bigger, they can't help but have done something actually good. This is harder than it sounds. Some say no numbers are like this. I ask them to show me much good being done another way, but they never can.) p (One thing I've learned is, people are great at making a number bigger, if you pay a lot of them to try. The key is to pick numbers where, if they make the number bigger, they can't help but have done something actually good. This is harder than it sounds. Some say no numbers are like this. I ask them to show me much good being done another way, but they never can.)

View File

@ -20,6 +20,11 @@ mixin Option(name, open)
| $ conda install spacy | $ conda install spacy
| $ python -m spacy.en.download all | $ python -m spacy.en.download all
p Latest stable conda packages are available from the spacy channel:
pre.language-bash: code
| $ conda install -c https://conda.anaconda.org/spacy spacy
+Option("pip and virtualenv", true) +Option("pip and virtualenv", true)
p With Python 2.7 or Python 3, using Linux or OSX, ensure that you have the following packages installed: p With Python 2.7 or Python 3, using Linux or OSX, ensure that you have the following packages installed:

View File

@ -1,83 +0,0 @@
mixin Option(name, open)
details(open=open)
summary
h4= name
block
article.post
header
h2 #[a(href=Meta.url)
p What's new in v0.90?
.subhead by #[a(href="//twitter.com/spacy_io", rel="author" target="_blank") #{spaCy}] on #[time #{getDate(Meta.date).fulldate}]
ul
li Support for gazetteers
li Set Lexeme attributes
#[a.readmore(href=Meta.url) Full Change Log ►]
section.intro
p What's
+Option("conda", true)
pre.language-bash: code
| $ conda install spacy
| $ python -m spacy.en.download
+Option("pip and virtualenv", true)
p With Python 2.7 or Python 3, using Linux or OSX, run:
pre.language-bash: code
| $ pip install spacy
| $ python -m spacy.en.download
p
| The download command fetches and installs about 300mb of data, for
| the parser model and word vectors, which it installs within the spacy.en
| package directory.
+Option("Workaround for obsolete system Python", false)
p
| If you're stuck using a server with an old version of Python, and you
| don't have root access, I've prepared a bootstrap script to help you
| compile a local Python install. Run:
pre.language-bash: code
| $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate
+Option("Compile from source", false)
p
| The other way to install the package is to clone the github repository,
| and build it from source. This installs an additional dependency,
| Cython. If you're using Python 2, I also recommend installing fabric
| and fabtools &ndash; this is how I build the project.
pre.language-bash: code
| $ git clone https://github.com/honnibal/spaCy.git
| $ cd spaCy
| $ virtualenv .env && source .env/bin/activate
| $ export PYTHONPATH=`pwd`
| $ pip install -r requirements.txt
| $ python setup.py build_ext --inplace
| $ python -m spacy.en.download
| $ pip install pytest
| $ py.test tests/
p
| Python packaging is awkward at the best of times, and it's particularly tricky
| with C extensions, built via Cython, requiring large data files. So,
| please report issues as you encounter them.
+Option("pypy (Unsupported)")
| If PyPy support is a priority for you, please get in touch. We could likely
| fix the remaining issues, if necessary. However, the library is likely to
| be much slower on PyPy, as it's written in Cython, which produces code tuned
| for the performance of CPython.
+Option("Windows (Unsupported)")
| Unfortunately we don't currently support Windows.

View File

@ -29,10 +29,10 @@ include ../header.jade
li: a.button(href="#example-use") Examples li: a.button(href="#example-use") Examples
li: a.button(href="#install") li: a.button(href="#install")
| Install | Install
<span class="button-caption">v0.93</span> <span class="button-caption">v0.94</span>
article.page.landing-page article.page.landing-page
+Section("Comparisons and Benchmarks", "comparisons", "./_comparisons.jade") +Section("Comparisons and Benchmarks", "comparisons", "./_comparisons.jade")
+Section("Online Demo", "online-demo", "./_online_demo.jade") +Section("Online Demo", "online-demo", "./_online_demo.jade")
+Section("Usage by Example", "example-use", "./_usage_examples.jade") +Section("Usage by Example", "example-use", "./_usage_examples.jade")
+Section("Install v0.93", "install", "./_installation.jade") +Section("Install v0.94", "install", "./_installation.jade")

View File

@ -1,38 +0,0 @@
include ../header.jade
mixin LicenseOption(name, period, price, audience)
.item
h4 #{name}
.focus #{period}
span #{price}
h5 Suggested for:
span #{audience}
a.button(href="/resources/pdf/spaCy_License_Agreement_2015.pdf", target="_blank") Download license
span or #[a(href="mailto:sales@spacy.io") get in touch]
- var Page = InitPage(Site, Authors.spacy, "license", "License")
+WritePage(Site, Authors.spacy, Page)
article.pricing
.box.license
+LicenseOption("Trial", "90 days", "$0", "Evaluation")
+LicenseOption("Production", "1 year", "$5,000", "Production")
+LicenseOption("Certainty", "5 years", "$20,000", "Secure Planning")
p.caption Researcher, hobbyist, or open-source developer? spaCy also offers #[a(href="http://www.gnu.org/licenses/agpl-3.0.en.html") AGPLv3] licenses.
blockquote.pull-quote
p Let's face it: Services disappear. Constantly. The good start-ups get bought; the bad ones go bankrupt.
p You need the source, and you need to know you can buy a long-term license. So that's what we offer. The difference between this and a black-box API is night and day.
p Let's face it: services disappear. Constantly. The good start-ups get bought; the bad ones go bankrupt. Open-source projects become abandoned or bloated. Google's graveyard is over-flowing &ndash; ditto for Yahoo!, Microsoft, etc. Sure, IBM won't be broke...But will BlueMix be sunset?
p A 5 year license won't expire until 2020. spaCy will be with you for longer than most of your current staff. If that's still not enough, get in touch. We can surely work something out.