In [None]:
import pandas as pd

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

# Location of the knowledge-graph file that the following cells load.
# It is a relative path, so it is resolved against the current working directory.
path_to_kg = "swemls-kg-2023-03.ttl"

In [None]:
# Load KG for embedding.
# Builds the pyrdf2vec `KG` object from the file at `path_to_kg`; this is the
# graph handed to the RDF2Vec transformer when the embeddings are learned.
kg = KG(path_to_kg)

In [None]:
# Collect the URIs of all resources that should receive an embedding.
#
# The file is parsed a second time with rdflib so the graph can be queried by
# rdf:type. An entity is kept when it
#   * is typed as swemls:SemanticWebResource,
#   * is NOT additionally typed as swemls:AndCompound or swemls:OrCompound,
#   * has a URI that starts with "http" and is not in the "Custom." namespace.
from rdflib import Graph, RDF, URIRef

SWEMLS_NS = "https://w3id.org/semsys/ns/swemls#"
CUSTOM_PREFIX = "http://semantic-systems.net/swemls/Custom."

# Build the class URIs once instead of once per candidate entity.
resource_class = URIRef(SWEMLS_NS + "SemanticWebResource")
compound_classes = (
    URIRef(SWEMLS_NS + "AndCompound"),
    URIRef(SWEMLS_NS + "OrCompound"),
)

g = Graph()
g.parse(path_to_kg)

entities = set()
for e in g.subjects(RDF.type, resource_class):
    # Skip compound nodes.
    if any((e, RDF.type, compound) in g for compound in compound_classes):
        continue
    s = e.toPython()
    if s.startswith("http") and not s.startswith(CUSTOM_PREFIX):
        entities.add(s)

# Sort so the entity order (and therefore every row index used in the cells
# below) is identical across kernel restarts; iterating a set of strings is
# not order-stable between Python processes.
lstentities = sorted(entities)
print(lstentities)

In [None]:
# Walk configuration: the depth and the maximum number of walks per entity.
# RANDOM_STATE pins the walker's random sampling so the extracted walks are the
# same on every run. It only covers the walk extraction; the embedding
# training that follows has its own sources of randomness.
RANDOM_STATE = 42

random_walker = RandomWalker(
    max_depth=8,
    max_walks=500,
    random_state=RANDOM_STATE,
)

# The transformer expects a list of walking strategies; only one is used here.
walkers = [random_walker]

In [None]:
# Learn the embeddings.
# The Word2Vec embedder is configured separately so its hyper-parameters are
# easy to read and tweak.
word2vec_embedder = Word2Vec(
    sg=1,
    vector_size=200,
    hs=1,
    window=5,
    min_count=0,
)
transformer = RDF2VecTransformer(walkers=walkers, embedder=word2vec_embedder)

# fit_transform returns a pair; only the first element (one embedding per
# entry of `lstentities`) is used in this notebook.
embeddings, _ = transformer.fit_transform(kg, lstentities)

In [None]:
# Prepare vectors.
# dfEntities: one row per entity URI (single column labelled 0).
# dfVectors:  one row per embedding, columns 0..N-1 are the vector components.
# Both frames share the default positional index, so row i of each refers to
# the same entity.
dfEntities = pd.DataFrame(lstentities)
dfVectors = pd.DataFrame.from_records(embeddings)

# Both frames have a column labelled 0. Rename the URI column in the joined
# view so the result has no duplicate column label; `dfEntities` itself is left
# unchanged because later cells index it by 0.
dfJoined = pd.concat(
    [dfEntities.rename(columns={0: "entity"}), dfVectors],
    axis=1,
    join="inner",
)
dfJoined.head(25)

In [None]:
# Fit a nearest-neighbour model over all embeddings (cosine distance) and print
# the entities closest to one chosen resource.
from sklearn.neighbors import NearestNeighbors

# Never request more neighbours than there are embedded entities.
n_neighbors = min(10, len(dfVectors))

knn = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto', metric='cosine')
knn.fit(dfVectors.to_numpy())

# find vector for a specific dataset
# example here: DBpedia
model_uri = "http://semantic-systems.net/swemls/Resource.DBpedia"
matching_vectors = dfVectors[dfEntities[0] == model_uri]
if matching_vectors.empty:
    # Without this check the mean below is all-NaN and the neighbour query
    # fails with an unrelated-looking "input contains NaN" error.
    raise ValueError(f"No embedding found for entity {model_uri!r}")
vector = matching_vectors.mean()

# The query entity is itself part of the fitted data, so it is expected to
# show up among its own nearest neighbours.
for index in knn.kneighbors([vector.to_numpy()], n_neighbors, return_distance=False)[0]:
    print(dfEntities.at[index, 0])

In [None]:
# Project the embeddings onto their first two principal components and draw a
# labelled scatter plot of all entities.
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import random as rd

pca = PCA(n_components=2)
pca_result = pca.fit_transform(dfVectors)
principalDf = pd.DataFrame(
    data=pca_result,
    columns=['principal component 1', 'principal component 2'],
)

finalDf = principalDf
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlim([-0.3, 0.5])
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.scatter(
    finalDf['principal component 1'],
    finalDf['principal component 2'],
    c='b',
)

# Annotate every point with the tail of its URI. The loop variable is
# deliberately not called `str`: that would shadow the builtin for the rest of
# the kernel session.
for uri, x, y in zip(
    dfEntities[0],
    finalDf['principal component 1'],
    finalDf['principal component 2'],
):
    # Slice from the last "." onwards, so the label keeps the leading dot.
    label = uri[uri.rindex("."):]
    ax.annotate(label, (x, y))

ax.grid()
plt.show()