Install:

$ pip install StanfordCoreNLP

Then download stanford-corenlp-4.1.0 and save it in your project’s folder.

To download more Arabic datasets, go to the Leipzig Corpora Collection website:

https://wortschatz.uni-leipzig.de/en/download/arabic

For the current example dataset,

find the POS tag of each word in the text and store the result in this format:

word <space> tag <tab> word2 <space> tag2 …

from stanfordcorenlp import StanfordCoreNLP

def find_pos(xsent):
    """Return the POS tags of *xsent* as a string of "word tag" pairs.

    The sentence is tagged by a CoreNLP instance started from the
    ``stanford-corenlp-4.1.0`` folder with ``lang='ar'``.  Every tagged
    token is passed through ``convert_ara_to_bw`` (presumably an
    Arabic-to-Buckwalter transliteration -- confirm against its definition)
    and written as the converted word, one space, and the tag.

    Each pair is followed by a tab character, so a non-empty result ends
    with a trailing tab; an input that yields no tokens gives ``''``.
    """
    with StanfordCoreNLP(r'stanford-corenlp-4.1.0', lang='ar') as nlp:
        tagged_tokens = nlp.pos_tag(xsent)
        pairs = ''.join(
            "{} {}\t".format(convert_ara_to_bw(token[0]), token[1])
            for token in tagged_tokens
        )
    return pairs

Let us look at some results:

# Tag a sample sentence; rstrip() removes the trailing tab that find_pos
# appends after the last "word tag" pair.
find_pos('ألا إنهم هم المفسدون ولكن لا يشعرون').rstrip()
Result:
>lA IN
<n IN
hm PRP
hm PRP
Almfsdwn DTNNS
w CC
lkn CC
lA RP
y$Erwn VBP

Read the file and find the words that share the same tag:

Read text file

def loadUnqList(p):
    """Read the text file at *p* and return its lines as a list of strings.

    Args:
        p: Path of the text file to read.

    Returns:
        The lines of the file in order, without their line terminators.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(p, encoding='utf-8') as fword:
        return fword.read().splitlines()



# Load the tagged sample file; loadquran holds one string per line of the file.
loadquran = loadUnqList('sample_msa_fixed.fo')
KeepQuran = []  # NOTE(review): not used by any of the statements shown here
# Report how many lines were loaded.
print(len(loadquran))


# Result
50

We can search for a tag, for example NNP (proper noun):

search_Tag = 'NNP'   # tag to look for
numres = 200         # maximum number of distinct words to keep later on

# Collect every word whose tag equals search_Tag.  Each loaded line is split
# on tabs into "word tag" fields, and each field on the space between them.
keepres = []
for line in loadquran:
    for field in line.split('\t'):
        parts = field.split(' ')
        # A field without a space (for instance the empty string left by a
        # trailing tab, or a blank line) has no tag; skip it instead of
        # failing with IndexError.
        if len(parts) < 2:
            continue
        if parts[1] == search_Tag:
            keepres.append(convert_bw_to_ara(parts[0]))
   


# Count the word frequency for each word

# Tally the collected words and keep the numres most frequent (word, count) pairs.
counts_nsw = collections.Counter(keepres)
top_words = counts_nsw.most_common(numres)

# The same ranking as a two-column table and as a plain list of words.
clean_tweets_nsw = pd.DataFrame(top_words, columns=['words', 'count'])
similar_words = [word for word, _ in top_words]

# Build the frequency table for the word cloud.  Each key is the word after
# arabic_reshaper.reshape and get_display (presumably so that the Arabic text
# renders with joined letters in right-to-left order -- confirm).
word_frequency = {}
for word, count in top_words:
    display_form = get_display(arabic_reshaper.reshape(word))
    word_frequency[display_form] = count
    
   

def plot_word_cloud(word_list: List[str], word_frequency: Dict[str, float]):
    """Draw a word cloud for the given words and show it with matplotlib.

    Args:
        word_list: Words that are joined with spaces, reshaped and passed
            to ``WordCloud.generate``.
        word_frequency: Mapping from word to frequency that is passed to
            ``WordCloud.fit_words``.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    joined = ' '.join(word_list)
    translated_text = get_display(arabic_reshaper.reshape(joined))

    # Build the Arabic word cloud
    cloud = WordCloud(
        font_path='tahoma',
        background_color='white',
        width=800,
        height=300,
    )
    cloud = cloud.generate(translated_text)
    # NOTE(review): fit_words presumably lays the cloud out again from the
    # explicit frequencies, which would make the generate() call above
    # redundant -- confirm before removing either call.
    cloud.fit_words(word_frequency)

    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.title('Search in Quran Tags, By Faisal Alshargi')
    plt.show()
    

Result:

0 بنت 3
1 عبدالله 3
2 بن 3
3 عبدالعزيز 3
4 آل 3
.. … …
56 جدة 1
57 أبو 1
58 أمريكا 1
59 أيار 1
60 سوريا 1

Search for past verbs in the text

# Settings for the past-verb search: keep up to 200 words tagged VBD.
numres = 200
search_Tag = 'VBD'

Search for present verbs in the text

# Settings for the present-verb search: keep up to 200 words tagged VBP.
# The tag is written with plain ASCII quotes; typographic quotes are not
# valid string delimiters in Python.
search_Tag = 'VBP'
numres = 200