# Recovered from a git patch that had been collapsed onto a single line:
#   commit:  c587f0b66ae75a2a8f1c6cad30b492cbeb2dfc8e
#   from:    Runxi Yu
#   date:    Sun, 16 Jun 2024 15:57:05 +0000
#   subject: Add everything related to past papers searching
#   file:    read_past_papers.py (new file, 27 insertions)
"""Dump the extracted text of past-paper PDFs to one JSON file per subject.

Expected layout below the root directory::

    <root>/<subject>/<year>/<paper>.pdf

Only sub-directories whose name contains no "." are visited, and only files
whose whole name matches ``QUESTION_PAPER`` (for example
``0580_s20_qp_12.pdf``) are read.

For every subject a file ``<subject_code>.json`` is written to the current
working directory, where ``<subject_code>`` is the last whitespace-separated
word of the subject directory name with parentheses removed (for example
``"Mathematics (0580)"`` gives ``"0580"``).  The JSON maps
``year -> file name -> 1-based page number (as a string) -> page text``.

Usage::

    python read_past_papers.py [ROOT]

``ROOT`` defaults to ``DEFAULT_ROOT``.
"""

import json
import os
import re
import sys

import PyPDF2 as pdf

DEFAULT_ROOT = "/Users/albert/Desktop/YK Pao School/Y9/Others/IGCSE Past Papers"

# Raw string, escaped dot, and used with fullmatch() so that names such as
# "0580_s20_qp_12.pdf.bak" or "0580_s20_qp_123pdf" are rejected.
QUESTION_PAPER = re.compile(r"\d{4}_\w\d{2}_qp_\d{1,2}\.pdf")


def _subdirectories(parent):
    """Return the sorted names of dot-free sub-directories of *parent*.

    The "no dot in the name" rule is the script's original filter; the
    additional ``isdir`` check skips extension-less plain files, which would
    otherwise make the later ``os.listdir`` call fail.
    """
    return [
        name
        for name in sorted(os.listdir(parent))
        if "." not in name and os.path.isdir(os.path.join(parent, name))
    ]


def _clean(text):
    """Lower-case *text*, turn newlines into spaces and drop "...." runs."""
    return text.lower().replace("\n", " ").replace("....", "")


def _read_pages(pdf_path):
    """Return ``{"1": text, "2": text, ...}`` for the PDF at *pdf_path*."""
    # Extraction happens inside the ``with`` block so the reader is only used
    # while the underlying file is open; the handle is closed afterwards.
    with open(pdf_path, "rb") as handle:
        reader = pdf.PdfReader(handle)
        return {
            str(number): _clean(page.extract_text())
            for number, page in enumerate(reader.pages, start=1)
        }


def _index_subject(root, subject):
    """Return ``{year: {file name: {page number: text}}}`` for one subject."""
    index = {}
    subject_dir = os.path.join(root, subject)
    for year in _subdirectories(subject_dir):
        year_dir = os.path.join(subject_dir, year)
        # A year gets an entry even when it contains no matching paper.
        papers = index[year] = {}
        for file_name in sorted(os.listdir(year_dir)):
            if QUESTION_PAPER.fullmatch(file_name):
                print(f"Reading ./{subject}/{year}/{file_name} ...")
                papers[file_name] = _read_pages(
                    os.path.join(year_dir, file_name)
                )
    return index


def main(root=DEFAULT_ROOT):
    """Index every subject below *root*, writing one JSON file per subject."""
    for subject in _subdirectories(root):
        print(f"Starting to read {subject} ...")
        subject_code = subject.split()[-1].replace("(", "").replace(")", "")
        index = _index_subject(root, subject)
        print(f"Creating {subject_code}.json ...")
        with open(f"{subject_code}.json", "w", encoding="utf-8") as out:
            json.dump(index, out, indent=4)
        print(f"Created {subject_code}.json")
    print("Process completed")


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_ROOT)