adding assignment template files

b700066b · Nikit Srivastava · b700066b · b700066b · b700066b · b700066b
Commit b700066b authored 2 months ago by Nikit Srivastava
--- a/README.md
+++ b/README.md
+<!-- It's good practice to have a README file in your project folder. This file should contain information about the project, how to install and run it, etc. -->
\ No newline at end of file
--- a/environment_initialization.sh
+++ b/environment_initialization.sh
+#!/bin/bash
+
+# This script is used to initialize the environment for the project.
\ No newline at end of file
--- a/prepare.sh
+++ b/prepare.sh
+#!/bin/bash
+
+# Optional hook run by preprocess_dataset.sh before the worker processes are spawned.
+# Reads N_PROCS from the environment (exported by the calling script).
+# The message is quoted so it is printed verbatim, without word splitting or glob expansion.
+echo "A total of $N_PROCS processes are going to be spawned. You can edit this script to perform any operation before those processes are spawned."
\ No newline at end of file
--- a/preprocess_dataset.sh
+++ b/preprocess_dataset.sh
+#!/bin/bash
+
+# Driver script: sets the process count, initializes the environment, runs the
+# optional preparation hook, then launches text_preprocessor.py via the spawner.
+
+export N_PROCS=20000 # Number of total processes to spawn. This is just an example, your utility should be ready to scale up or down when this value is changed.
+
+source environment_initialization.sh
+
+# Call to some preparation logic (optional)
+bash prepare.sh
+
+# Spawner calling your preprocessing script N_PROCS times across the computing cluster.
+# Pass the variable's value ("$N_PROCS"), not the literal string "N_PROCS".
+super_duper_process_spawner -n "$N_PROCS" python text_preprocessor.py
+
+echo "Finishing preprocessing data."
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
+# list of all the packages required for this project
\ No newline at end of file
--- a/setup.sh
+++ b/setup.sh
+#!/bin/bash
+
+# This script is used to set up the environment for running your code.
\ No newline at end of file
--- a/text_preprocessor.py
+++ b/text_preprocessor.py
+
+OUTPUT_FILE_TEMPLATE = "/shared-file-storage/preprocessed_data/preprocessed_{rank}.txt"
+
+def preprocess_text(text):
+    """Lowercase *text*, trim surrounding whitespace, and split it on whitespace.
+
+    Returns the resulting list of tokens (empty list for blank input).
+    """
+    lowered = text.lower()
+    trimmed = lowered.strip()
+    return trimmed.split()
+
+def load_dataset(*args, **kwargs):
+    """Placeholder dataset loader.
+
+    Accepts arbitrary positional and keyword arguments and currently always
+    raises NotImplementedError.
+    """
+    # TODO: There should be some logic about loading the dataset here. As we only have 2 Gigabytes of memory available per process, maybe that should play a role as well ;)
+    message = "This function has not been implemented yet."
+    raise NotImplementedError(message)
+
+def write_preprocessed_text(preprocessed_text, rank):
+    """Write tokenized lines to the output file for the given rank.
+
+    Args:
+        preprocessed_text: Iterable of token sequences (each an iterable of
+            strings); every sequence becomes one tab-separated output line.
+        rank: Value substituted into OUTPUT_FILE_TEMPLATE to build the path.
+
+    Raises:
+        OSError: If the output file cannot be opened or written.
+    """
+    output_path = OUTPUT_FILE_TEMPLATE.format(rank=rank)
+    # Pin the encoding so the output does not depend on the locale of the
+    # machine a given process happens to run on.
+    with open(output_path, "w", encoding="utf-8") as f:
+        # Hand all lines to the buffered writer at once instead of issuing two
+        # small write() calls per line.
+        f.writelines("\t".join(tokens) + "\n" for tokens in preprocessed_text)
+
+def main():
+    """Template entry point: load the text, preprocess it, and write the result.
+
+    This is an unfinished skeleton. local_rank and total_procs are still
+    placeholder None values, and load_dataset() currently raises
+    NotImplementedError, so the steps after it are not reached yet. The final
+    raise marks the function itself as incomplete.
+    """
+    # TODO: Here you should try to check the rank of this process and the total number of processes that are spawned
+    local_rank = None # TODO: This should be the rank of the process
+    total_procs = None # TODO: This should be the total number of processes that are spawned
+    # Extract the text to process
+    text_to_process = load_dataset() # TODO: pass the relevant arguments (if any)
+    # Preprocess the text
+    preprocessed_text = [preprocess_text(text) for text in text_to_process]
+    # TODO: Write this somewhere
+    write_preprocessed_text(preprocessed_text, rank=local_rank)
+    raise NotImplementedError("This function has not been fully implemented yet.")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file