# -*- coding: utf-8 -*-
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas
# Arabic stop words. Stored as a set because findsimilarity tests every token
# of every aya against this collection; a set makes each test O(1) instead of
# a scan over the whole stop-word list.
arastopword = set(stopwords.words('arabic'))
# Column name given to the single text column of the corpus file.
names = ['aya']
# NOTE(review): read_csv splits on commas, so this presumably expects one aya
# per line with no commas in the text -- confirm against the data file.
data = pandas.read_csv('data/qr_with/quran.txt', names=names)
print("All Ayat: ", len(data))
print(data)
# Search terms to look for in the corpus
Searchfor = 'هارون وزير فرعون'
# Similarity threshold: only ayat scoring strictly above it are kept
simdegree = 0.2
# Collected matches; findsimilarity appends [aya_text, score] pairs here
Result = []
def findsimilarity(X, Y):
    """Score Y against X by cosine similarity of their keyword sets.

    Both strings are tokenized with ``word_tokenize``, tokens found in the
    module-level ``arastopword`` collection are dropped, and the remaining
    distinct tokens of each string are treated as a binary vector. For binary
    vectors the cosine reduces to ``|A & B| / sqrt(|A| * |B|)``.

    Args:
        X: Query text.
        Y: Candidate text to compare with the query.

    Returns:
        None. When the score is strictly greater than the module-level
        ``simdegree``, ``[Y, round(score, 4)]`` is appended to the
        module-level ``Result`` list.
    """
    # tokenization
    X_list = word_tokenize(X)
    Y_list = word_tokenize(Y)
    # remove stop words and keep only the distinct keywords of each string
    X_set = {w for w in X_list if w not in arastopword}
    Y_set = {w for w in Y_list if w not in arastopword}
    # A string with no keywords left has a zero-length vector; the cosine is
    # undefined there and the division below would raise ZeroDivisionError.
    # Such a pair shares no keyword, so it is simply not a match.
    if not X_set or not Y_set:
        return
    # cosine formula: the dot product of two 0/1 vectors is the number of
    # shared keywords, and each squared norm is the size of the keyword set
    shared = len(X_set & Y_set)
    cosine = shared / float((len(X_set) * len(Y_set)) ** 0.5)
    if cosine > simdegree:
        Result.append([Y, round(cosine, 4)])
# Score every aya in the corpus against the search terms; findsimilarity
# records the ones above the threshold in Result as a side effect.
for d in data['aya']:
    findsimilarity(Searchfor, d)
def takeSecond(elem):
    """Return the item at index 1 of ``elem``.

    Used as the sort key for ``Result``, whose entries are
    ``[text, score]`` pairs, so sorting with this key orders by score.
    """
    return elem[1]
# sort the result in place by score, ascending, so the closest matches are
# printed last
Result.sort(key=takeSecond)
# print the final result: the aya text, its score, then an empty line
for j in Result:
    print(j[0])
    print(j[1])
    print('')
وأضل فرعون قومه وما هدى
0.25
قال فرعون وما رب العالمين
0.25
فأرسل فرعون في المدائن حاشرين
0.25
من فرعون إنه كان عاليا من المسرفين
0.25
اذهب إلى فرعون إنه طغى
0.2887
اذهبا إلى فرعون إنه طغى
0.2887
قوم فرعون ألا يتقون
0.2887
اذهب إلى فرعون إنه طغى
0.2887
هارون أخي
0.3536
فرعون وثمود
0.3536
طريقة رسم الكلمات المتشابهة من القرآن الكريم على شكل word cloud باستخدام الفكتور مودل word2vec لحساب الكلمات المتشابهة. يمكنك تحميل المودل واتباع خطوات البرنامج للحصول على نفس النتائج.
from gensim.models import KeyedVectors
from bidi.algorithm import get_display
import arabic_reshaper
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from typing import List, Dict
# function to plot the word cloud
def plot_word_cloud(word_list: List[str], word_frequency: Dict[str, float]):
    """Render an Arabic word cloud and show it in a matplotlib window.

    Args:
        word_list: Words to draw; they are joined with spaces, reshaped with
            ``arabic_reshaper`` and passed through ``get_display`` before
            being handed to ``WordCloud.generate``.
        word_frequency: Mapping of word to weight, passed to
            ``WordCloud.fit_words``. The caller is expected to supply keys
            that are already reshaped for display; this function does not
            transform them.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    full_string = ' '.join(word_list)
    # Arabic letters must be joined into their contextual forms and put in
    # display order before the text is drawn
    reshaped_text = arabic_reshaper.reshape(full_string)
    translated_text = get_display(reshaped_text)
    # Build the Arabic word cloud
    wordc = WordCloud(
        font_path='tahoma',
        background_color='white',
        width=800,
        height=300,
    ).generate(translated_text)
    # NOTE(review): fit_words appears to rebuild the layout from
    # word_frequency, which would make the generate() pass above redundant --
    # confirm against the wordcloud documentation before removing either call.
    wordc.fit_words(word_frequency)
    # Draw the word cloud
    plt.imshow(wordc)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
# load the model
model = KeyedVectors.load('model/quran_w7_m15.bin')
print("Model loaded")
# check the model size
print ('Number of all words: ', len(model.wv.vocab))
# Enter the word you want to search
Word_to_plot = 'النهار'
# result size: how many similar words to request
retsize = 200
# Query through model.wv, the same attribute the vocabulary check above uses,
# rather than through the model object itself. The result is indexed below as
# (word, score) pairs.
temp_tuple = model.wv.most_similar(positive=[Word_to_plot], negative=[], topn=retsize)
similar_words = [i[0] for i in temp_tuple]
# Map each word, reshaped into its display form, to its similarity score so
# the cloud can size words by score
word_frequency = {}
for word_tuple in temp_tuple:
    reshaped_word = arabic_reshaper.reshape(word_tuple[0])
    key = get_display(reshaped_word)
    word_frequency[key] = word_tuple[1]
plot_word_cloud(similar_words, word_frequency)
Result:
# Alternative query settings. Each pair below only reassigns Word_to_plot and
# retsize; no similarity query or plot call follows them in this part of the
# file, so on their own they produce no output.
# Enter the word you want to search
Word_to_plot = 'كريم'
# result size: how many similar words to request
retsize = 200
# Enter the word you want to search (overrides the value assigned just above)
Word_to_plot = 'النار'
# result size: how many similar words to request
retsize = 200