Text Summarization


OVERVIEW

DATASET

NEWS SUMMARY

Generating short descriptions of news articles.

www.kaggle.com
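
For context, the dataset can be loaded with pandas. A minimal sketch, assuming the CSV file name, the encoding, and the headlines/text column names from the Kaggle "News Summary" listing (none of which are shown in the original write-up):

import pandas as pd

# Assumed file name, encoding, and column names for the Kaggle "News Summary" dataset
df = pd.read_csv("news_summary.csv", encoding="latin-1")
print(df[["headlines", "text"]].head())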

MODELS

> HEADLINE MODEL

Google Colaboratory


colab.research.google.com

> SUMMARIZER 3 MODEL

Google Colaboratory


colab.research.google.com

EXTRACTIVE SUMMARIZATION

> SUMMARIZER 1

from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en import English
import numpy as np

nlp = English()
nlp.add_pipe('sentencizer')


def summarizer(text, tokenizer=nlp, max_sent_in_summary=3):
    # Split the text into sentences
    doc = nlp(text.replace("\n", ""))
    sentences = [sent.text.strip() for sent in doc.sents]
    # Remember the original position of each sentence
    sentence_organizer = {k: v for v, k in enumerate(sentences)}
    # TF-IDF over word n-grams, with English stop words removed
    vectorizer = TfidfVectorizer(min_df=2, max_features=None,
                                 strip_accents='unicode',
                                 analyzer='word',
                                 token_pattern=r'\w{1,}',
                                 ngram_range=(1, 3),
                                 use_idf=1, smooth_idf=1,
                                 sublinear_tf=1,
                                 stop_words='english')
    vectorizer.fit(sentences)
    vectors = vectorizer.transform(sentences)
    # Score each sentence by the sum of its TF-IDF weights
    scores = np.array(vectors.sum(axis=1)).ravel()
    N = max_sent_in_summary
    top_n_sentences = [sentences[ind] for ind in np.argsort(scores, axis=0)[::-1][:N]]
    # Restore the top N sentences to their original order
    top_n = [(sentence, sentence_organizer[sentence]) for sentence in top_n_sentences]
    top_n = sorted(top_n, key=lambda x: x[1])
    ordered_scored_sentences = [element[0] for element in top_n]
    summary = " ".join(ordered_scored_sentences)
    return summary
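
A quick usage sketch (the input file is a placeholder; any article text works):

article = open("content.txt").read()  # placeholder source text
print(summarizer(article, max_sent_in_summary=3))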

> SUMMARIZER 2

import nltk  # Natural Language ToolKit
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

def read_article(file_name):
    # Collapse the file onto a single line so it can be split on ". "
    with open(file_name) as f:
        a = " ".join(line.strip() for line in f)
    with open(file_name, "r+") as f:
        f.truncate(0)
        f.write(a)
    file = open(file_name, "r")
    filedata = file.readlines()
    #print(filedata)
    article = filedata[0].split(". ")
    sentances = []
    for sentance in article:
        sentances.append(sentance.replace("[^a-zA-Z]", "").split(" "))
    #sentances.pop()
    return sentances

def sentance_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    # Bag-of-words vectors over the union of both sentences' vocabularies
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)

def gen_sim_matrix(sentances, stop_words):
    similarity_matrix = np.zeros((len(sentances), len(sentances)))
    for idx1 in range(len(sentances)):
        for idx2 in range(len(sentances)):
            if idx1 == idx2:
                continue
            similarity_matrix[idx1][idx2] = sentance_similarity(sentances[idx1], sentances[idx2], stop_words)
    return similarity_matrix

def generate_summary(file_name, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []
    sentances = read_article(file_name)
    matrix = gen_sim_matrix(sentances, stop_words)
    graph = nx.from_numpy_array(matrix)
    # PageRank over the sentence-similarity graph (TextRank)
    scores = nx.pagerank(graph)
    ranked_sentance = sorted(((scores[i], s) for i, s in enumerate(sentances)), reverse=True)
    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentance[i][1]))
    text = ". ".join(summarize_text)
    return text
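
Usage is file-based, since read_article rewrites the input file in place; a minimal sketch, assuming content.txt holds the article:

print(generate_summary("content.txt", top_n=5))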

ABSTRACTIVE SUMMARIZATION

> SUMMARIZER 3

from models.summary_model.model import SummaryModel, tokenizer

trained_model = SummaryModel.load_from_checkpoint("models\\summary_model\\best-checkpoint.ckpt")
trained_model.freeze()

def summarize(text):
    text_encoding = tokenizer(
        text,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )

    # Beam-search generation with the fine-tuned model
    generated_ids = trained_model.model.generate(
        input_ids=text_encoding["input_ids"],
        attention_mask=text_encoding["attention_mask"],
        max_length=150,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )

    preds = [
        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for gen_id in generated_ids
    ]

    return "".join(preds)
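
A usage sketch, assuming the checkpoint above loads successfully:

sample = open("content.txt").read()  # placeholder article text
print(summarize(sample))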

> HEADLINE

from simplet5 import SimpleT5

def generate_headline(text):
    model = SimpleT5()
    # Load the fine-tuned T5 headline model from disk
    model.load_model("t5", "models\\headline_model", use_gpu=False)
    headline = model.predict(text)[0]
    return headline
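
Note that the weights are reloaded from disk on every call; one option (a sketch, not part of the original code) is to hoist the load to module level:

model = SimpleT5()
model.load_model("t5", "models\\headline_model", use_gpu=False)  # load once at import time

def generate_headline(text):
    return model.predict(text)[0]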

PDF TO TEXT

pip install PyPDF2

import PyPDF2

def pdf_to_text(file):
    pdffile = open(file, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdffile)
    num = pdfReader.numPages
    # Extract every page and write it to content.txt
    # (opened once, so earlier pages are not overwritten)
    newfile = open(r"content.txt", "w")
    for i in range(0, num):
        pageobj = pdfReader.getPage(i)
        resulttext = pageobj.extractText()
        newfile.writelines(resulttext)
    newfile.close()
    demo = open("content.txt", "r")
    str1 = demo.read()
    return str1
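
A usage sketch with a hypothetical PDF path:

text = pdf_to_text("uploaded_pdfs/sample.pdf")  # sample.pdf is a placeholder
print(text[:500])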

MAIN

import streamlit as st
from io import StringIO
from pdf_txt import pdf_to_text
from headline import generate_headline
from summaryr1 import summarizer
from summaryr2 import generate_summary
from summaryr3 import summarize
from pathlib import Path
import os

st.markdown("<h1 style='text-align: center;'>TEXT SUMMARIZER v2.0</h1>",
            unsafe_allow_html=True)


file = st.file_uploader("Please choose a file", type=['txt', 'pdf'])
st.markdown("<h5 style='text-align: center;'>OR</h5>", unsafe_allow_html=True)
text = st.text_area("Input text to summarize (more than 200 words)", height=200)
col1, col2, col3 = st.columns(3)
if col1.button('SUMMARIZE'):
    # try:

        if file is not None:
            if bool(text) == True:
                st.error("ERROR: YOU CANNOT ENTER BOTH")
                st.stop()
            else:
                if file.name[-3:] == "pdf":
                    # Save the uploaded PDF to disk, then extract its text
                    path = Path("uploaded_pdfs/" + file.name)
                    path.write_bytes(file.getvalue())
                    text = pdf_to_text("uploaded_pdfs/" + file.name)

                else:
                    stringio = StringIO(file.getvalue().decode("utf-8"))
                    text = stringio.read()

        # Persist the input text for the file-based summarizer
        textfile = open("content.txt", "w")
        textfile.write(text)
        textfile.close()
        headline = generate_headline(text)
        summary1 = summarizer(text)
        summary2 = generate_summary("content.txt")
        summary3 = summarize(text)
        st.write("")
        st.subheader(headline)
        st.markdown("<h4> > Summary 1 : </h4>", unsafe_allow_html=True)
        st.write(summary1)
        st.markdown("<h4> > Summary 2 : </h4>", unsafe_allow_html=True)
        st.write(summary2)
        st.markdown("<h4> > Summary 3 : </h4>", unsafe_allow_html=True)
        st.write(summary3)
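
Assuming the script above is saved as main.py (the file name is not given in the write-up), the app can be launched with:

streamlit run main.py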

CONCLUSION
