if args.local_rank not in [-1, 0] and not evaluate:
    # Make sure only the first process in distributed training preprocesses
    # the dataset; the other processes wait here and will use the cache.
    torch.distributed.barrier()

...  # (preprocess the data and save the preprocessed data)

if args.local_rank == 0 and not evaluate:
    # Preprocessing is done; release the waiting processes to load the cache.
    torch.distributed.barrier()
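
# A fuller, self-contained sketch of the same barrier pattern, for
# illustration only: `load_and_cache_examples`, `preprocess`, and the cache
# path below are hypothetical stand-ins for the elided preprocessing step.
import os

import torch


def preprocess(data_dir):
    # Hypothetical placeholder for the elided preprocessing work.
    return [f"feature from {data_dir}"]


def load_and_cache_examples(args, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Ranks > 0 wait here until rank 0 has built the cache.
        torch.distributed.barrier()

    cache_file = os.path.join(args.data_dir, "cached_features.pt")
    if os.path.exists(cache_file):
        # Either rank 0 on a later run, or ranks > 0 after being released.
        features = torch.load(cache_file)
    else:
        features = preprocess(args.data_dir)
        if args.local_rank in [-1, 0]:
            torch.save(features, cache_file)

    if args.local_rank == 0 and not evaluate:
        # Rank 0 is done preprocessing; release the waiting ranks, which
        # will now find the cache file and load it directly.
        torch.distributed.barrier()

    return features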