Skip to content
Snippets Groups Projects
Commit 0a2171b0 authored by joeld's avatar joeld
Browse files

added comments

parent 1f6f3a8e
No related branches found
No related tags found
No related merge requests found
......@@ -17,6 +17,7 @@ class DataLoader:
os.makedirs(self.output_dir, exist_ok=True)
def partition_data(self):
""" this partitions files among processes and writes assigned data to a shard file for better testing/debugging """
master_list = get_master_file_list(self.input_dir)
assigned_files = [
file_path for idx, file_path in enumerate(master_list)
......
......@@ -28,6 +28,7 @@ def main():
lines_count, tokens_count = processor.process()
processing_time = time.time() - start_processing
# minimal processing statistics/timing for eval
total_time = time.time() - start_total
avg_tokens = tokens_count / lines_count if lines_count else 0
print(f"Proc {args.proc_rank}: Partition {partition_time:.2f}s, Processing {processing_time:.2f}s, "
......
......@@ -12,6 +12,7 @@ class ShardProcessor:
self.batch_size = batch_size
def process(self):
""" preprocesses text which is currently only splitting + lowering text, writes tokenized batches to the output file"""
lines_count = 0
tokens_count = 0
try:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment