mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			46 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			46 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# coding: utf-8
 | 
						|
"""
Example of loading previously parsed text using spaCy's DocBin class. The example
performs an entity count to show that the annotations are available.
For more details, see https://spacy.io/usage/saving-loading#docs

Installation:
    python -m spacy download en_core_web_lg

Usage:
    python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy
"""
 | 
						|
from __future__ import unicode_literals
 | 
						|
 | 
						|
import spacy
 | 
						|
from spacy.tokens import DocBin
 | 
						|
from timeit import default_timer as timer
 | 
						|
from collections import Counter
 | 
						|
 | 
						|
# Default path of the serialized DocBin file; used as the default value of
# main()'s ``docbin_path`` argument.
EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy"
 | 
						|
 | 
						|
 | 
						|
def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
    """Read a serialized DocBin, count its entities and report throughput.

    model: Name passed to ``spacy.load``; only the loaded pipeline's vocab
        is used, to rebuild the docs.
    docbin_path: Path of the DocBin file, opened in binary mode.

    Prints the number of tokens loaded, the elapsed time, the resulting
    words-per-second rate and the 30 most frequent (label, text) entity
    pairs. The timed region covers deserializing the DocBin and iterating
    its docs; loading the model and reading the file are not timed.
    """
    pipeline = spacy.load(model)
    print("Reading data from {}".format(docbin_path))
    with open(docbin_path, "rb") as handle:
        serialized = handle.read()

    word_total = 0
    started = timer()
    entity_counts = Counter()
    stored_docs = DocBin().from_bytes(serialized)
    for doc in stored_docs.get_docs(pipeline.vocab):
        word_total += len(doc)
        for ent in doc.ents:
            entity_counts[(ent.label_, ent.text)] += 1
    elapsed = timer() - started

    template = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
    words_per_second = word_total / elapsed
    print(template.format(nr_word=word_total, seconds=elapsed, wps=words_per_second))

    print("Most common entities:")
    for key, count in entity_counts.most_common(30):
        ent_label, ent_text = key
        print(count, ent_text, ent_label)
 | 
						|
 | 
						|
 | 
						|
if __name__ == "__main__":
    # plac is imported here rather than at module level, so importing this
    # file as a module does not require it.
    import plac

    # Hand main to plac; the module docstring's usage line passes the model
    # name and the DocBin path as positional command-line arguments.
    plac.call(main)
 |