"""Compute quartiles of the study-program statistics and export them to CSV."""

import csv

import numpy as np
import pandas as pd

# ---------------------------------------------------------------------------
# Disabled one-off preparation step, kept for reference: it builds
# "./study_program_data.json" from the raw Parcoursup exports.
# ---------------------------------------------------------------------------
# # Original data with most statistics
# scraped_data = pd.read_csv("./2024-01-17-donnéesparcoursup.csv", delimiter="#")
# # Downloaded file with info such as GPS location
# downloaded_data = pd.read_json("./fr-esr-parcoursup.json")
# # Scraping results including program description and job prospects description
# description_data = pd.read_csv(
#     "./2024-01-20-infoparcoursup.csv",
#     delimiter="$",
#     quoting=3,
#     engine="python",
#     encoding="UTF-8",
# )
# # Combine the first two documents
# merged_data = pd.merge(scraped_data, downloaded_data, on="cod_aff_form")
# # Add the third document
# all_data = pd.merge(merged_data, description_data, on="cod_aff_form")
# # Select only the necessary columns
# selected_columns = [
#     "cod_aff_form",
#     "lib_for_voe_ins",
#     "select_form",
#     "g_olocalisation_des_formations",
#     "ville_etab",
#     "cod_uai",
#     "place_dispo",
#     "taux_acces",
#     "nombre_voeux",
#     "pourcentage_boursiers",
#     "candidats_hors_secteur",
#     "taux_passage_L2",
#     "taux_diplome_temps_prevu",
#     "description",
#     "job_prospects",
#     "contrat_etab",
#     "g_ea_lib_vx",
# ]
# selected_data = (
#     all_data[selected_columns].drop_duplicates(subset=["cod_aff_form"]).copy()
# )
# # Rename columns to clearer names
# selected_data = selected_data.rename(
#     columns={
#         "lib_for_voe_ins": "program_name",
#         "select_form": "selectivity",
#         "g_olocalisation_des_formations": "geolocation",
#         "ville_etab": "city",
#         "place_dispo": "available_places",
#         "taux_acces": "percent_admitted",
#         "nombre_voeux": "number_applicants",
#         "pourcentage_boursiers": "percent_scholarship",
#         "candidats_hors_secteur": "out_of_sector_candidates",
#         "taux_passage_L2": "L2_continuation_rate",
#         "taux_diplome_temps_prevu": "diploma_earned_ontime",
#         "contrat_etab": "school_type",
#         "g_ea_lib_vx": "school_name",
#     }
# )
# selected_data["L2_continuation_rate"] = (
#     selected_data["L2_continuation_rate"]
#     .str.replace(",", ".")
#     .str.replace("ns", "NaN")
#     .astype("float")
# )
# # Write the result to JSON, note that we will need to manually convert it
# # to a list after export
# selected_data.to_json("./study_program_data.json", orient="records", lines=True)
# print("json created")

# Columns of the study-program data for which quartiles are computed.
stat_columns = [
    "percent_admitted",
    "percent_scholarship",
    "out_of_sector_candidates",
    "L2_continuation_rate",
    "diploma_earned_ontime",
]


def compute_quartiles(frame: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Return the 25th, 50th and 75th percentiles of the given columns.

    Args:
        frame: Table holding at least the columns named in ``columns``.
        columns: Names of the columns to summarise.

    Returns:
        A DataFrame with one row per entry of ``columns`` and one column per
        quantile level (0.25, 0.5, 0.75).
    """
    # Coerce to numbers first: placeholder strings such as "None" become NaN
    # and are skipped by quantile(). Without this, an object-dtype column
    # makes DataFrame.quantile raise on pandas >= 2.0 (older versions
    # silently dropped such columns from the result instead).
    numeric = frame[columns].apply(pd.to_numeric, errors="coerce")
    return numeric.quantile(q=[0.25, 0.5, 0.75]).transpose()


if __name__ == "__main__":
    # The export is read as a plain JSON list (no lines=True); see the note
    # in the preparation step about converting the file manually.
    selected_data = pd.read_json("./study_program_data.json", encoding="UTF-8")

    stats = compute_quartiles(selected_data, stat_columns)
    print(stats)

    stats.to_csv("studyprogram_stats.csv", index=True, sep="#")
    print("CSV created")

# Previously obtained quartiles (25 %, 50 %, 75 %):
# # diploma_earned_ontime = [47.6,58.7,66.8]
# # percent_scholarship = [12.0,17.0,23.0]
# # out_of_sector_candidates=[18.0,40.0,50.0]
# # diploma_earned_ontime=[47.6,58.7,66.8]
# # use these stats to set quartile on object creation