In [8]:
import pandas as pd 
import requests 
import datetime as dt
import re
import json
from tqdm import tqdm
import os
import time
from openai import OpenAI

#### Calculate

In [9]:
if "OPENAI_API_KEY" not in os.environ:
    with open('/home/sagemaker-user/Sciences-POC/config/secrets/keys.txt', 'r') as f:
        keys = json.loads(f.read())
else : 
    keys=os.environ

In [27]:
save_path = 'data/outputs'
content_path = 'data/extract_sciences_po'


def retrieve_classifications(name, mapping_prompt):

    df = pd.read_csv('data/extract_sciences_po.csv')

    if os.path.exists(f"{save_path}/output_{name}.txt"):
        with open(f"{save_path}/output_{name}.txt", 'r') as f : 
            out_dict = json.loads(f.read())
        out_df = pd.DataFrame.from_dict(out_dict)
        out = out_dict
    else : 
        out_df = pd.DataFrame(columns = ['item_id', 'categorie_principale', 'categorie_secondaire'])
        out = []

    df_to_process = df.loc[~df.item_id.isin(out_df.item_id)]

    if mapping_prompt[name]['client']=='deepseek':
        client = OpenAI(api_key=keys["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com")
        model="deepseek-chat"
    else:
        client=OpenAI(api_key="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")
        model="gpt-4o-mini"

    with open(mapping_prompt[name]['path_prompt'], 'r') as f:
        prompt = f.read()

    if mapping_prompt[name]['client']=='openai-assistant':
        
            assistant = client.beta.assistants.create(
            name="News classifier",
            instructions=prompt,
            response_format={ "type": "json_object"},
            model="gpt-4o-mini",
            )

            assistant_id = assistant.id #mapping_prompt[name]['assistant_id']

    with tqdm(total=df_to_process.shape[0]) as pbar:

        for i, row in df_to_process.iterrows():
            titre_brut = f"{row.item_id}_"+row.titre.lower().strip().replace(f"\xa0", ' ').replace(' : ', ':').replace(' ', '_').replace('/', '')
            
            with open(f'{content_path}/{titre_brut}.txt', 'r') as f:
                text = f.read()

            if mapping_prompt[name]['client']=='openai-assistant':
                
                # Step 1: Create a thread
                thread = client.beta.threads.create()

                # Step 2: Add a user message
                client.beta.threads.messages.create(
                    thread_id=thread.id,
                    role="user",
                    content=text
                )

                # Step 3: Run the assistant
                run = client.beta.threads.runs.create(
                    thread_id=thread.id,
                    assistant_id=assistant_id,
                )

                # Step 4: Wait for completion
                while True:
                    run = client.beta.threads.runs.retrieve(
                        thread_id=thread.id,
                        run_id=run.id,
                    )
                    if run.status == "completed":
                        break
                    elif run.status in ["failed", "cancelled", "expired"]:
                        raise Exception(f"""Run failed with status: {run.status}\n
                                        {run}""")
                    time.sleep(1)

                # Step 5: Get last assistant message only
                messages = client.beta.threads.messages.list(thread_id=thread.id)
                assistant_messages = [m for m in messages.data if m.role == "assistant"]

                if assistant_messages:
                    # Get the most recent assistant message
                    latest = assistant_messages[0]
                    content = latest.content[0].text.value                   
                    
            else:
                messages = [{"role": "system", "content": prompt},
                            {"role": "user", "content": text}]

                response = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    response_format={
                        'type': 'json_object'
                    }
                )
                content = response.choices[0].message.content
            try : 
                cat_json = json.loads(content)

                out.append({
                    'item_id':row.item_id, 
                    'categorie_principale': cat_json['categorie_principale'],
                    'categorie_secondaire': cat_json['categorie_secondaire'],
                })
                
                with open(f'{save_path}/output_{name}.txt', 'w+') as f : 
                    f.write(json.dumps(out))

            except Exception as e : 
                print(f'Error with article {row.item_id}')
                pass

                
            pbar.update(1)



In [28]:
with open('config/mapping_prompts.txt', 'r') as f : 
    mapping = json.loads(f.read())

for name in mapping.keys():
    print(name)
    retrieve_classifications(name, mapping)

dimanov_et_al


  6%|▌         | 29/509 [03:56<1:21:08, 10.14s/it]

Error with article F5TBC6SGHRGRFJGZXZYG73I2C4


 13%|█▎        | 65/509 [09:58<1:19:31, 10.75s/it]

Error with article PCZQ5Q2SHJFHZANPXZW2CUQVWU


 15%|█▍        | 75/509 [10:54<39:59,  5.53s/it]  

Error with article KCFRPP6YOJEMZMUEP55JMHKPCI


 16%|█▌        | 81/509 [11:28<39:46,  5.58s/it]

Error with article AQ3WGXNGMFC7TF5NO73LD2AQBY


 17%|█▋        | 87/509 [12:09<46:30,  6.61s/it]

Error with article KAO3B4ZMY5HP3NMHSSORBBHEVQ


 20%|█▉        | 101/509 [14:32<2:06:30, 18.60s/it]

Error with article OKZW3KTDFNHTDIOZSJTUUWTECM


 21%|██        | 106/509 [15:06<58:56,  8.78s/it]  

Error with article UBNN4MNV35BCVICDFZVGQEAPYU


 21%|██        | 108/509 [15:21<53:29,  8.00s/it]  

Error with article VWT2DL4B2RESPHBN5GLSRSKCXA


 21%|██▏       | 109/509 [15:26<46:12,  6.93s/it]

Error with article NE7X4E4S7BGCVN42KSQHYGPR5M


 28%|██▊       | 140/509 [20:16<43:06,  7.01s/it]  

Error with article 7MSP4B5UWFDURJY2ZA4E26D6Y4


 28%|██▊       | 141/509 [20:21<39:19,  6.41s/it]

Error with article 5GQ4EE3DBZBJ3FOP2J3FHRAW6Y


 35%|███▌      | 180/509 [25:47<45:26,  8.29s/it]  

Error with article FOMV5KH4UZBFNJHHOTLIRNBTNA


 36%|███▌      | 182/509 [25:58<36:52,  6.77s/it]

Error with article KXP2DEGANVHOXNRBFFU2X44KME


 39%|███▊      | 197/509 [28:03<50:13,  9.66s/it]  

Error with article HIY7OM5AGFF7NAWUFKDSFKBIQU


 40%|███▉      | 202/509 [29:15<1:19:59, 15.64s/it]

Error with article GK5PBHMGJNGH7IC6TRV7BPISRI


 46%|████▌     | 235/509 [33:50<26:15,  5.75s/it]  

Error with article WHQXZPRU4ZGNHHFODI5AF4QREQ


 48%|████▊     | 245/509 [35:12<39:31,  8.98s/it]

Error with article JMNDPKJDOBGDPOVOT3B2TMEI3Q


 53%|█████▎    | 272/509 [40:57<28:50,  7.30s/it]  

Error with article OY5WOZYJJFCUXAY2IP3MDF5DBI


 54%|█████▍    | 276/509 [41:21<24:21,  6.27s/it]

Error with article 4UXLV4RIYRGI3LLOJ4VIFIS3PU


 56%|█████▌    | 286/509 [43:40<43:09, 11.61s/it]  

Error with article BDUEDA6Q5VFA5JVZUYKANSBEJU


 58%|█████▊    | 297/509 [45:06<30:24,  8.60s/it]

Error with article LTW4AQF5ZJFA5IYGRMTYK5KYYA


 74%|███████▎  | 375/509 [56:16<19:53,  8.90s/it]

Error with article TYDKDANLIVFMXMGT6QNNLZXFT4


 74%|███████▍  | 376/509 [56:22<17:48,  8.03s/it]

Error with article B62VZRCINRBWTI2ZP5KSJCALQY


 76%|███████▋  | 389/509 [58:22<17:08,  8.57s/it]

Error with article NEIWXLDUOBFG5P7N3QJFXKRWKU


 77%|███████▋  | 391/509 [58:38<15:53,  8.08s/it]

Error with article Y4SGMAXQGJFA3EIXQM2P4ULYAA


 77%|███████▋  | 392/509 [58:49<17:23,  8.92s/it]

Error with article 2GHECSGTINFFLLDNFB6GWBWT74


 82%|████████▏ | 416/509 [1:03:07<16:25, 10.59s/it]

Error with article A76MZTTKFFDNTEBAHEFQR3YMWA


 82%|████████▏ | 418/509 [1:03:34<17:11, 11.33s/it]

Error with article RVAAO6SOURFWXLYT5UGAZQMFDI


 84%|████████▎ | 426/509 [1:04:50<14:42, 10.63s/it]

Error with article ZPJYQHA4YZA7XJVVYINHDXY52Y


 84%|████████▍ | 429/509 [1:06:26<32:25, 24.32s/it]

Error with article HQWSHI5H3ZFWRC6CR3EGK2CWOU


 85%|████████▍ | 431/509 [1:06:40<19:53, 15.30s/it]

Error with article UJCAG7SOBRC4DB4GD3YRYKHYJE


 91%|█████████ | 462/509 [1:13:31<10:16, 13.12s/it]

Error with article CSVWJ7KVPBHLPH4LGTSWPYA5IE


 95%|█████████▌| 484/509 [1:17:49<04:09,  9.96s/it]

Error with article BQ6E3KG74ZFQPEHRYVAUUDLTRY


 97%|█████████▋| 492/509 [1:18:53<01:56,  6.83s/it]

Error with article VC2YC2LPWRA2ZGM6DM3JWZKVHY


 99%|█████████▉| 505/509 [1:21:06<00:41, 10.36s/it]

Error with article AOT254SA2VDIDNF4YW7XPLWJ5E


100%|██████████| 509/509 [1:22:17<00:00,  9.70s/it]

sans_titre_1





KeyError: 'DEEPSEEK_API_KEY'

In [38]:
articles = pd.read_csv('data/extract_sciences_po.csv')

with open("data/outputs/output_favarel_et_al.txt", 'r') as f : 
    out_dict = json.loads(f.read())


df = pd.DataFrame.from_dict(out_dict)

articles = pd.merge(df, articles, on='item_id', how='left')

count_principale = df.groupby('categorie_principale').item_id.count()
df['categorie_secondaire'] = df.apply(lambda x : x.categorie_secondaire.split(',')[0] if x.categorie_secondaire!=None else None, axis=1)

    

In [40]:
df

Unnamed: 0,item_id,categorie_principale,categorie_secondaire
0,I4OEKQ6MHRBP3LQVVYDDXW6T6U,UPDATE ME,EDUCATE ME
1,LVA4MZBQOBECNPZD323NV6O7K4,INSPIRE ME,EDUCATE ME
2,4FAEHUUZ5ZFAJKLFEV2LT5CBAQ,EDUCATE ME,GIVE ME PERSPECTIVE
3,4S4G6BKFRNER3LB22CLPAEWWKY,GIVE ME PERSPECTIVE,INSPIRE ME
4,ZAFHRNAHJVC6THXRSBMCB4A24I,INSPIRE ME,EDUCATE ME
...,...,...,...
511,AOT254SA2VDIDNF4YW7XPLWJ5E,INSPIRE ME,ENTERTAIN ME
512,GUOUKHLPFZBK7GVR5XU7MXVD5A,INSPIRE ME,EDUCATE ME
513,5HT6C24ZBVDOBFXPLA4HNVOTT4,EDUCATE ME,UPDATE ME
514,VLV6RSQ6U5E6XJ6AIRV26AEKO4,UPDATE ME,EDUCATE ME


#### Ajouter images