summaryrefslogtreecommitdiff
path: root/read_past_papers.py
diff options
context:
space:
mode:
author Albert Tan <s22505@ykpaoschool.cn> 2023-04-13 20:37:04 +0800
committer Albert Tan <s22505@ykpaoschool.cn> 2023-04-13 20:37:04 +0800
commit 6517a9a2c41f1b0f131b0518d5111291691f1add (patch)
tree f53c2a5e0cfaf602560b4ce15e12274054631fd2 /read_past_papers.py
downloadsearch_past_papers-master.tar.gz
search_past_papers-master.zip
squish albertHEADmaster
Diffstat (limited to 'read_past_papers.py')
-rw-r--r-- read_past_papers.py 27
1 files changed, 27 insertions, 0 deletions
diff --git a/read_past_papers.py b/read_past_papers.py
new file mode 100644
index 0000000..b7870a3
--- /dev/null
+++ b/read_past_papers.py
@@ -0,0 +1,27 @@
+import os
+import re
+import json
+import PyPDF2 as pdf
+
# Root folder holding one sub-folder per subject; the last whitespace-separated
# word of each sub-folder name (parentheses stripped) becomes the JSON file
# name, e.g. "Chemistry (0620)" -> "0620.json".
path = "/Users/albert/Desktop/YK Pao School/Y9/Others/IGCSE Past Papers"

# Question-paper file names such as "0620_s19_qp_11.pdf".  Raw string so the
# backslashes reach the regex engine, and the dot is escaped so that only a
# literal ".pdf" extension is accepted.
_QP_NAME = re.compile(r"\d\d\d\d_\w\d\d_qp_\d\d?\.pdf")


def _is_folder(parent, name):
    """Return True if *name* is a dot-free entry of *parent* and a directory."""
    # The dot test keeps the original filter (skips files with extensions and
    # hidden entries); isdir additionally rejects extension-less files, which
    # would otherwise make os.listdir() raise NotADirectoryError.
    return "." not in name and os.path.isdir(os.path.join(parent, name))


def _read_pages(pdf_path):
    """Return ``{"1": text, "2": text, ...}`` for every page of *pdf_path*.

    Page text is lower-cased, newlines become spaces and runs of four dots
    are removed.
    """
    # The context manager closes the handle as soon as the text is extracted,
    # so a large collection cannot exhaust the file-descriptor limit.
    with open(pdf_path, "rb") as stream:
        reader = pdf.PdfReader(stream)
        return {
            str(number): page.extract_text().lower().replace("\n", " ").replace("....", "")
            for number, page in enumerate(reader.pages, start=1)
        }


def _index_subject(root, subject):
    """Return ``{year: {file_name: {page_number: text}}}`` for one subject."""
    subject_dir = os.path.join(root, subject)
    index = {}
    for year in sorted(os.listdir(subject_dir)):
        if not _is_folder(subject_dir, year):
            continue
        year_dir = os.path.join(subject_dir, year)
        papers = index[year] = {}
        for file in sorted(os.listdir(year_dir)):
            # fullmatch: the whole name must be a question-paper name, so
            # things like "..._qp_11.pdf.bak" are not fed to the PDF reader.
            if _QP_NAME.fullmatch(file):
                print(f"Reading ./{subject}/{year}/{file} ...")
                papers[file] = _read_pages(os.path.join(year_dir, file))
    return index


def main(root=path, out_dir="."):
    """Write one ``<subject_code>.json`` per subject folder found in *root*.

    Args:
        root: Folder containing the subject sub-folders.
        out_dir: Folder the JSON files are written to (default: the current
            working directory).
    """
    for subject in sorted(os.listdir(root)):
        if not _is_folder(root, subject):
            continue
        print(f"Starting to read {subject} ...")
        subject_code = subject.split()[-1].replace("(", "").replace(")", "")
        dic = _index_subject(root, subject)
        print(f"Creating {subject_code}.json ...")
        with open(os.path.join(out_dir, f"{subject_code}.json"), "w") as f:
            json.dump(dic, f, indent=4)
        print(f"Created {subject_code}.json")
    print("Process completed")


if __name__ == "__main__":
    main()