https://wortschatz.uni-leipzig.de/en/download/arabic
for the current example dataset
Find the POS tag for each word in the text, using this format:
word tag word2 tag2 …
from stanfordcorenlp import StanfordCoreNLP
def find_pos(xsent: str) -> str:
    """Tag *xsent* with Stanford CoreNLP and return ``word tag`` pairs.

    Every token returned by ``nlp.pos_tag`` is passed through
    ``convert_ara_to_bw`` (defined elsewhere; presumably Arabic to
    Buckwalter transliteration) and emitted as ``"<word> <tag>"`` followed
    by a tab character. The returned string therefore ends with a trailing
    tab; callers that do not want it should ``rstrip()`` the result.

    Args:
        xsent: The sentence to tag.

    Returns:
        The concatenated, tab-terminated ``word tag`` pairs, or an empty
        string when the tagger returns no tokens.
    """
    # NOTE(review): a new StanfordCoreNLP object is created on every call;
    # this is presumably expensive -- consider reusing one instance when
    # tagging many sentences.
    with StanfordCoreNLP(r'stanford-corenlp-4.1.0', lang='ar') as nlp:
        tagged = nlp.pos_tag(xsent)
    # Build the output in one pass instead of repeated string concatenation.
    return "".join(
        "{} {}\t".format(convert_ara_to_bw(word), tag) for word, tag in tagged
    )
Let us get some results:
find_pos('ألا إنهم هم المفسدون ولكن لا يشعرون').rstrip()
Result:
>lA IN
<n IN
hm PRP
hm PRP
Almfsdwn DTNNS
w CC
lkn CC
lA RP
y$Erwn VBP
Read the file and find the words that share the same tag:
Read the text file:
def loadUnqList(p):
    """Return the lines of the text file at *p* as a list of strings.

    The whole file is read at once and split with ``str.splitlines``, so
    line terminators are removed and an empty file yields an empty list.

    Args:
        p: Path of the file to read.

    Returns:
        A list with one string per line of the file.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(p, encoding='utf-8') as fword:
        return fword.read().splitlines()
# Not referenced anywhere else in the visible part of this file.
KeepQuran = []
# Read the sample file into a list with one string per line
# (presumably the tab-separated "word tag" output of find_pos -- confirm).
loadquran = loadUnqList('sample_msa_fixed.fo')
# Show how many lines were loaded.
print(len(loadquran))
# Result
50
We can search for a tag such as NNP (proper noun):
# POS tag to search for, and the maximum number of distinct words to keep
# when the matches are ranked later on.
search_Tag = 'NNP'
numres = 200
# Collect every word whose tag equals search_Tag, converted with
# convert_bw_to_ara (defined elsewhere).
keepres = []
for line in loadquran:
    # Each line holds tab-separated "word tag" pairs.
    for pair in line.split('\t'):
        parts = pair.split(' ')
        # Skip chunks without a tag (empty lines, trailing tabs) instead
        # of failing with IndexError on parts[1].
        if len(parts) < 2:
            continue
        if parts[1] == search_Tag:
            keepres.append(convert_bw_to_ara(parts[0]))
# Count the word frequency for each word
counts_nsw = collections.Counter(keepres)
# Rank the words once and reuse the result below, rather than calling
# most_common() separately for every derived structure.
top_words = counts_nsw.most_common(numres)
# Tabular view of the ranking: one row per word with its count.
clean_tweets_nsw = pd.DataFrame(top_words, columns=['words', 'count'])
# The ranked words alone, most frequent first.
similar_words = [word for word, _ in top_words]
# Frequency mapping keyed by the reshaped, display-ordered form of each
# word; this prepares the data passed to the word cloud, it does not plot.
word_frequency = {
    get_display(arabic_reshaper.reshape(word)): count
    for word, count in top_words
}
def plot_word_cloud(word_list: List[str], word_frequency: Dict[str, float]):
    """Draw a word cloud for *word_list* / *word_frequency* with matplotlib.

    The words are joined with spaces, reshaped with ``arabic_reshaper`` and
    reordered with ``get_display`` before being handed to ``WordCloud``.
    The cloud is then fitted to *word_frequency* and shown in a figure
    without axes.

    Args:
        word_list: Words to join into the text given to ``generate``.
        word_frequency: Mapping from word to weight given to ``fit_words``.
    """
    joined_words = ' '.join(word_list)
    display_text = get_display(arabic_reshaper.reshape(joined_words))

    # Build the Arabic word cloud
    cloud = WordCloud(
        font_path='tahoma',
        background_color='white',
        width=800,
        height=300,
    )
    cloud = cloud.generate(display_text)
    # NOTE(review): fit_words() is called right after generate(); it looks
    # like this replaces the layout computed from the text, which would make
    # the generate() call redundant -- confirm against the wordcloud docs.
    cloud.fit_words(word_frequency)

    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.title('Search in Quran Tags, By Faisal Alshargi')
    plt.show()
Result:
0 بنت 3
1 عبدالله 3
2 بن 3
3 عبدالعزيز 3
4 آل 3
.. … …
56 جدة 1
57 أبو 1
58 أمريكا 1
59 أيار 1
60 سوريا 1

Search for past verbs in the text:
# Tag to search for: VBD, used here to find past-tense verbs.
search_Tag = 'VBD'
# Maximum number of distinct words to keep in the ranking.
numres = 200

Search for present verbs in the text
# Tag to search for: VBP, used here to find present-tense verbs.
# Plain ASCII quotes are required; typographic quotes are a syntax error.
search_Tag = 'VBP'
# Maximum number of distinct words to keep in the ranking.
numres = 200
