Creates a data frame with four columns:
#### dependencies ####
import pandas as pd
import numpy as np
import os
import json
import datetime
#### Navigate to your CORD-19 download folder #####
# IPython magic: changes the working directory so the relative folder paths used below resolve.
%cd "C:\Users\jwr17\Desktop\CORD-19-research-challenge"
#### understanding the file system ######
# Folders to inspect, relative to the current working directory; each entry is
# a sub-folder nested inside a parent folder of the same name.
file_paths = [
    "biorxiv_medrxiv\\biorxiv_medrxiv",
    "comm_use_subset\\comm_use_subset",
    "custom_license\\custom_license",
    "noncomm_use_subset\\noncomm_use_subset",
]

# Print the number of entries in each folder, then the grand total (kept in `i`).
i = 0
for path in file_paths:
    n_files = len(os.listdir(path))
    print(f"file folder {path} has a total of {str(n_files)} files")
    i += n_files
print(f"\n total files are equal to {str(i)}")
########################################################### reading in the file Loop #################################################################################
# Parse every JSON file in the listed folders into one record per paper
# (id, title, abstract, body text, source folder) and collect the records
# into the `master_text` DataFrame.


def _join_section_text(sections):
    """Concatenate the 'text' of every section, each preceded by one space."""
    return "".join(" " + section["text"] for section in sections)


def _load_paper(json_path):
    """Read one paper JSON file and return its id, title, abstract and body text.

    Raises OSError if the file cannot be opened, ValueError if it is not
    decodable/valid JSON, and KeyError/TypeError if the expected fields
    are missing or have an unexpected structure.
    """
    # JSON text is normally UTF-8; relying on the platform default encoding
    # (e.g. cp1252 on Windows) can make valid files fail to decode.
    with open(json_path, "r", encoding="utf-8") as read_file:
        data = json.load(read_file)
    return {
        "paper_id": data["paper_id"],
        # Key spelling is kept as-is because it becomes a column name.
        "papepr_title": data["metadata"]["title"],
        "abstract": _join_section_text(data["abstract"]),
        "main_text": _join_section_text(data["body_text"]),
    }


master_text = []
i = 0  # files skipped because they could not be read or parsed
count = 0  # files visited so far (parsed or skipped)
file_paths = [
    "biorxiv_medrxiv\\biorxiv_medrxiv",
    "comm_use_subset\\comm_use_subset",
    "custom_license\\custom_license",
    "noncomm_use_subset\\noncomm_use_subset",
]
for path in file_paths:
    # Everything before the last backslash, i.e. the top-level folder name
    # (e.g. "biorxiv_medrxiv"); it is the same for every file in `path`.
    folder = path.rsplit("\\", 1)[0]
    for file in os.listdir(path):  ### cycles through every json file from a given file path
        try:
            record = _load_paper(os.path.join(path, file))
        except (OSError, ValueError, KeyError, TypeError) as err:
            # Best effort: one unreadable or malformed file must not abort the
            # whole run, but report which file was dropped and why.
            i += 1
            print(f"skipped {file}: {err!r}")
            print(f"total skiped: {str(i)}")
        else:
            record["file_path"] = folder
            master_text.append(record)
        count += 1
        if count % 500 == 0:
            print(f"complted {str(count)} text files as of {str(datetime.datetime.now())}")

# One row per successfully parsed paper.
master_text = pd.DataFrame(master_text)
master_text
# IPython magic: switch the working directory to the my_code_dfs folder.
%cd "C:\Users\jwr17\Desktop\CORD-19-research-challenge\my_code_dfs"
# Write the data frame to master_text.csv in that folder (pandas also writes the row index by default).
master_text.to_csv("master_text.csv")
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%cd "C:\Users\jwr17\Desktop\CORD-19-research-challenge\my_code_dfs"
master_test = pd.read_csv('master_text.csv')
meta = pd.read_csv("C:\\Users\\jwr17\\Desktop\\CORD-19-research-challenge\\metadata.csv")
### papers that are not in json froma
meta[~meta.sha.isin(master_test.paper_id)]