create_kb

Created Diff never expires
1 removal
66 lines
7 additions
73 lines
import typer
import typer
import csv
import csv
import os
import os
from pathlib import Path
from pathlib import Path


import spacy
import spacy
from spacy.kb import KnowledgeBase
from spacy.kb import KnowledgeBase




def main(entities_loc: Path, vectors_model: str, kb_loc: Path, nlp_dir: Path):
    """Step 1: create the Knowledge Base in spaCy and write it to file.

    Args:
        entities_loc: CSV file with the entities to disambiguate to
            (read by ``_load_entities``).
        vectors_model: Name or path of the spaCy pipeline handed to
            ``spacy.load``; its vectors are used to encode the descriptions.
        kb_loc: Location the knowledge base is written to.
        nlp_dir: Directory the pipeline is written to; created if missing.
    """
    # First: create a simple model from predefined vectors and a simple EntityRuler component
    # A more realistic use-case would use a pretrained NER model instead
    # `exclude` is given as a list of component names: a single
    # comma-separated string is not reliably split into names by spaCy.
    nlp = spacy.load(vectors_model, exclude=["parser", "ner"])
    ruler = nlp.add_pipe("entity_ruler")
    patterns = [
        {"label": "PERSON", "pattern": [{"LOWER": "emerson"}]},
        {"label": "PERSON", "pattern": [{"LOWER": "fred"}, {"LOWER": "stolle"}]},
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe("sentencizer")

    name_dict, desc_dict = _load_entities(entities_loc)

    # NOTE(review): assumes the vectors of `vectors_model` are 300-dimensional,
    # since the description vectors below are stored as entity vectors.
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

    for qid, desc in desc_dict.items():
        desc_doc = nlp(desc)
        desc_enc = desc_doc.vector
        # Set arbitrary value for frequency
        kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)

    # Two hand-added candidate entities that share the alias "fred stolle" below.
    kb.add_entity(entity="1", entity_vector=nlp("fred stoll 1").vector, freq=15)
    kb.add_entity(entity="2", entity_vector=nlp("fred stoll 2").vector, freq=60)

    for qid, name in name_dict.items():
        # set 100% prior probability P(entity|alias) for each unique name
        kb.add_alias(alias=name, entities=[qid], probabilities=[1])

    kb.add_alias(alias="fred stolle", entities=["1", "2"], probabilities=[0.4, 0.6])

    # Materialise the keys so add_alias receives a plain list, not a dict view.
    qids = list(name_dict)
    probs = [0.3 for _ in qids]
    # ensure that sum([probs]) <= 1 when setting aliases
    # (with 0.3 per entity this only holds for up to three entities in the CSV)
    kb.add_alias(alias="Emerson", entities=qids, probabilities=probs)

    print(f"Entities in the KB: {kb.get_entity_strings()}")
    print(f"Aliases in the KB: {kb.get_alias_strings()}")
    print()
    kb.to_disk(kb_loc)
    # Create the output directory, including missing parents, without
    # failing if it already exists.
    Path(nlp_dir).mkdir(parents=True, exist_ok=True)
    nlp.to_disk(nlp_dir)




def _load_entities(entities_loc: Path):
def _load_entities(entities_loc: Path):
""" Helper function to read in the pre-defined entities we want to disambiguate to. """
""" Helper function to read in the pre-defined entities we want to disambiguate to. """
names = dict()
names = dict()
descriptions = dict()
descriptions = dict()
with entities_loc.open("r", encoding="utf8") as csvfile:
with entities_loc.open("r", encoding="utf8") as csvfile:
csvreader = csv.reader(csvfile, delimiter=",")
csvreader = csv.reader(csvfile, delimiter=",")
for row in csvreader:
for row in csvreader:
qid = row[0]
qid = row[0]
name = row[1]
name = row[1]
desc = row[2]
desc = row[2]
names[qid] = name
names[qid] = name
descriptions[qid] = desc
descriptions[qid] = desc
return names, descriptions
return names, descriptions




# Script entry point: typer builds the command-line interface from main's signature.
if __name__ == "__main__":
    typer.run(main)