From c587f0b66ae75a2a8f1c6cad30b492cbeb2dfc8e Mon Sep 17 00:00:00 2001
From: Runxi Yu
Date: Sun, 16 Jun 2024 15:57:05 +0000
Subject: Add everything related to past papers searching

---
 pdf_read_test.py               |   8 +++
 read_past_papers.py            |  40 +++++++++++++
 search_past_papers.py          |  20 +++++++
 search_past_papers_advanced.py | 132 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 200 insertions(+)
 create mode 100644 pdf_read_test.py
 create mode 100644 read_past_papers.py
 create mode 100644 search_past_papers.py
 create mode 100644 search_past_papers_advanced.py

diff --git a/pdf_read_test.py b/pdf_read_test.py
new file mode 100644
index 0000000..d879362
--- /dev/null
+++ b/pdf_read_test.py
@@ -0,0 +1,8 @@
+import PyPDF2 as pdf
+
+# Print the extracted text of every page so the extraction can be checked by
+# eye; the file stays open while pages are read and is closed afterwards.
+with open("0460_s22_ms_11.pdf", "rb") as f:
+    r = pdf.PdfReader(f)
+    for page in r.pages:
+        print(page.extract_text(), "\n")
diff --git a/read_past_papers.py b/read_past_papers.py
new file mode 100644
index 0000000..b7870a3
--- /dev/null
+++ b/read_past_papers.py
@@ -0,0 +1,40 @@
+import os
+import re
+import json
+import PyPDF2 as pdf
+
+path = "/Users/albert/Desktop/YK Pao School/Y9/Others/IGCSE Past Papers"
+
+# Question papers are named like "0460_s22_qp_11.pdf". The pattern is a raw
+# string with the dot escaped, and it is applied with fullmatch, so only
+# complete question paper names are accepted.
+qp_pattern = re.compile(r"\d\d\d\d_\w\d\d_qp_\d\d?\.pdf")
+
+for subject in sorted(os.listdir(path)):
+    # Names containing a dot are taken to be files, not subject folders.
+    if "." not in subject:
+        print(f"Starting to read {subject} ...")
+        # The subject code is the last word of the folder name, minus parentheses.
+        subject_code = subject.split()[-1].replace("(", "").replace(")", "")
+        # dic[year][file][page number] = lower-cased text of that page
+        dic = {}
+        for year in sorted(os.listdir(os.path.join(path, subject))):
+            if "." not in year:
+                dic[year] = {}
+                year_path = os.path.join(path, subject, year)
+                for file in sorted(os.listdir(year_path)):
+                    if qp_pattern.fullmatch(file):
+                        print(f"Reading ./{subject}/{year}/{file} ...")
+                        dic[year][file] = {}
+                        # Close each PDF once its text has been extracted.
+                        with open(os.path.join(year_path, file), "rb") as pdf_file:
+                            r = pdf.PdfReader(pdf_file)
+                            # Page numbers start at 1 and are stored as strings.
+                            for number, page in enumerate(r.pages, start=1):
+                                text = page.extract_text().lower()
+                                dic[year][file][str(number)] = text.replace("\n", " ").replace("....", "")
+        print(f"Creating {subject_code}.json ...")
+        with open(f"{subject_code}.json", "w") as f:
+            json.dump(dic, f, indent=4)
+        print(f"Created {subject_code}.json")
+print("Process completed")
diff --git a/search_past_papers.py b/search_past_papers.py
new file mode 100644
index 0000000..a053226
--- /dev/null
+++ b/search_past_papers.py
@@ -0,0 +1,20 @@
+import json
+
+subject_code = input("Subject code: ")
+# Close the json file as soon as it has been loaded.
+with open(f"{subject_code}.json") as f:
+    dic = json.load(f)
+search_str = input("Word or words to be searched: ").lower()
+found = False
+print("\nThe following are files that contain such string: ")
+
+# dic is nested as year -> file name -> page number -> page text.
+for papers in dic.values():
+    for file, pages in papers.items():
+        for page, text in pages.items():
+            if search_str in text:
+                print(file, f"[Page {page}]")
+                found = True
+
+if not found:
+    print("No file found")
diff --git a/search_past_papers_advanced.py b/search_past_papers_advanced.py
new file mode 100644
index 0000000..45deca0
--- /dev/null
+++ b/search_past_papers_advanced.py
@@ -0,0 +1,132 @@
+import os
+import json
+
+# "Empty" is the placeholder for "nothing loaded yet" in dic, code and files.
+dic = {"Empty": True}
+path = r"/Users/albert/Documents/IGCSE_Past_Papers"
+code = "Empty"
+files = ["Empty"]
+
+help_doc = '''Help on module 'search_past_papers':
+--- set
+    > set (int)code
+        Accepts a valid 4-digit integer subject code and loads corresponding json.
+        Returns a dictionary from the json file that can be later used in 'list'.
+--- list
+    > list *args
+        Accepts multiple arguments that will be concatenated into a string.
+        Returns all past papers that contain the string.
+--- open
+    > open (char)option
+        Accepts a character indicating which file type to be opened.
+            - q: question paper only
+            - m: corresponding mark scheme only
+            - b: both question paper and corresponding mark scheme
+        Opens all files of the indicated type returned from the last 'list'.
+--- help
+    > help
+        Returns a help document.
+--- quit
+    > quit
+        Leaves the program properly and directly. '''
+
+
+def subject(code):
+    """Load and return the page-text dictionary stored in '{code}.json'.
+
+    Raises FileNotFoundError if there is no json file for the subject code.
+    """
+    with open(f"{code}.json") as f:
+        return json.load(f)
+
+
+def search(dic, string):
+    """Print every page containing string and return the matching files.
+
+    dic is nested as year -> file name -> page number -> page text. Every
+    matching page is printed, but each file is returned only once so that
+    'open' does not launch the same paper once per matching page. Returns
+    ["Empty"] when nothing matches.
+    """
+    files = []
+    for papers in dic.values():
+        for file, pages in papers.items():
+            for page, text in pages.items():
+                if string in text:
+                    print(file, f"[Page {page}]")
+                    if file not in files:
+                        files.append(file)
+    if not files:
+        print("Warning: no file found, unable to use 'open' later")
+        files = ["Empty"]
+    return files
+
+
+try:
+    while True:
+        action = input(">>> ")
+        action_l = action.split()
+
+        if not action_l:
+            print("Invalid syntax: no command entered")
+
+        elif action_l[0] == "set":
+            if len(action_l) == 2:
+                if len(action_l[1]) == 4 and action_l[1].isdigit():
+                    try:
+                        dic = subject(action_l[1])
+                    except FileNotFoundError:
+                        print("Execution failed: subject not supported, try a different code")
+                    else:
+                        # Switch subject only once its json has loaded, and drop
+                        # the file list that belonged to the previous subject.
+                        code = action_l[1]
+                        files = ["Empty"]
+                else:
+                    print(f"Execution failed: 'set' does not accept argument '{action_l[1]}'")
+            else:
+                print(f"Execution failed: 'set' expects 1 argument, gets {len(action_l)-1}")
+
+        elif action_l[0] == "list":
+            if len(action_l) >= 2:
+                if "Empty" not in dic:
+                    files = search(dic, " ".join(action_l[1:]).lower())
+                else:
+                    print("Execution failed: no subject selected, use 'set' first")
+            else:
+                print("Execution failed: 'list' expects at least 1 argument, gets 0")
+
+        elif action_l[0] == "help":
+            print(help_doc)
+
+        elif action_l[0] == "quit":
+            print("Process completed")
+            break
+
+        elif action_l[0] == "open":
+            if len(action_l) == 2:
+                if code != "Empty" and "Empty" not in files:
+                    # File names look like 0460_s22_qp_11.pdf: characters 6-7
+                    # are the two-digit year, and swapping "qp" for "ms" names
+                    # the corresponding mark scheme.
+                    if action_l[1] == "m" or action_l[1] == "b":
+                        for file in files:
+                            os.system(f"open {path}/{code}/20{file[6:8]}/{file[:9]}ms{file[11:]}")
+                    if action_l[1] == "q" or action_l[1] == "b":
+                        for file in files:
+                            os.system(f"open {path}/{code}/20{file[6:8]}/{file}")
+                    elif action_l[1] not in ["q", "m", "b"]:
+                        print(f"Execution failed: 'open' does not accept argument '{action_l[1]}'")
+                elif code == "Empty":
+                    print("Execution failed: no subject selected, use 'set' first")
+                else:
+                    print("Execution failed: last 'list' finds no file")
+            else:
+                print(f"Execution failed: 'open' expects 1 argument, gets {len(action_l)-1}")
+
+        else:
+            print(f"Invalid syntax: command '{action_l[0]}' not found")
+
+except (KeyboardInterrupt, EOFError):
+    # Ctrl-C and end of input (Ctrl-D) both leave the prompt cleanly.
+    print("Process exited")
+--
+cgit v1.2.3