Add everything related to past papers searching

author: Runxi Yu <me@runxiyu.org> 2024-06-16 15:57:05 +0000
committer: Runxi Yu <me@runxiyu.org> 2024-06-16 15:57:05 +0000
commit: c587f0b66ae75a2a8f1c6cad30b492cbeb2dfc8e (patch)
tree: 2dd4ac678b4c696553862cbbd95a971591ae3fb1 /read_past_papers.py
parent: f92bab5c8e2fe3829f8f85bb9e039db2ec2b01de (diff)
download: albertscripts-c587f0b66ae75a2a8f1c6cad30b492cbeb2dfc8e.tar.gz
albertscripts-c587f0b66ae75a2a8f1c6cad30b492cbeb2dfc8e.zip
1 files changed, 27 insertions, 0 deletions
diff --git a/read_past_papers.py b/read_past_papers.py
new file mode 100644
index 0000000..b7870a3
--- /dev/null
+++ b/read_past_papers.py
@@ -0,0 +1,27 @@
+import os
+import re
+import json
+import PyPDF2 as pdf
+
+path = "/Users/albert/Desktop/YK Pao School/Y9/Others/IGCSE Past Papers"
+QP_NAME = re.compile(r"\d\d\d\d_\w\d\d_qp_\d\d?\.pdf")  # question-paper file name; raw string, "." is literal
+# Walk <path>/<subject>/<year>/<paper>.pdf and dump each subject's extracted page text to <subject_code>.json.
+for subject in sorted(os.listdir(path)):
+    if "." not in subject:  # entries whose name contains a dot are skipped (treated as non-folders)
+        print(f"Starting to read {subject} ...")
+        subject_code = subject.split()[-1].replace("(", "").replace(")", "")  # last word, parentheses stripped
+        dic = {}  # {year: {file name: {1-based page number as str: normalised page text}}}
+        for year in sorted(os.listdir(os.path.join(path, subject))):
+            if "." not in year:  # same dot heuristic as for subject folders
+                dic[year] = {}
+                for file in sorted(os.listdir(os.path.join(path, subject, year))):
+                    if QP_NAME.fullmatch(file):  # fullmatch rejects trailing junk such as "...pdf.bak"
+                        print(f"Reading ./{subject}/{year}/{file} ...")
+                        with open(os.path.join(path, subject, year, file), "rb") as fh:  # close every PDF after extraction
+                            dic[year][file] = {str(n): p.extract_text().lower().replace("\n", " ").replace("....", "")
+                                               for n, p in enumerate(pdf.PdfReader(fh).pages, 1)}
+        print(f"Creating {subject_code}.json ...")
+        with open(f"{subject_code}.json", "w") as f:  # written to the current working directory
+            json.dump(dic, f, indent=4)
+        print(f"Created {subject_code}.json")
+print("Process completed")
author	Runxi Yu <me@runxiyu.org>	2024-06-16 15:57:05 +0000
committer	Runxi Yu <me@runxiyu.org>	2024-06-16 15:57:05 +0000
commit	c587f0b66ae75a2a8f1c6cad30b492cbeb2dfc8e (patch)
tree	2dd4ac678b4c696553862cbbd95a971591ae3fb1 /read_past_papers.py
parent	f92bab5c8e2fe3829f8f85bb9e039db2ec2b01de (diff)
download	albertscripts-c587f0b66ae75a2a8f1c6cad30b492cbeb2dfc8e.tar.gz albertscripts-c587f0b66ae75a2a8f1c6cad30b492cbeb2dfc8e.zip