Source code for reorganize_multiprocessing

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
The `REORGANIZE` module restructures data within a folder to prepare it for further processing with img2radiomics. This module organizes the data in a consistent format, ensuring compatibility with other processing modules.

**Note**: It’s recommended to run the `CHECK_FOLDER` module first to verify the folder structure before attempting reorganization. This helps avoid errors if the folder is incorrectly structured.

Options
-------

The `REORGANIZE` module accepts the following options:

- **verbose**: Enables verbose mode, providing additional details during execution (default: False).
- **timer**: Enables the timer to measure execution time (default: False).
- **inputFolder**: Specifies the path to the folder containing subfolders with DICOM or NIfTI files to reorganize.
- **outputFolder**: Specifies the path where the reorganized data will be saved. If omitted, `inplace` must be set to True.
- **outputFolderSuffix**: Adds a suffix to the name of `inputFolder` to create an output folder with a modified name (useful if `outputFolder` is not specified).
- **inplace**: Directly modifies the input folder instead of creating a new one. **Warning**: Use with caution, as any failure during reorganization may corrupt the data.
  
  .. warning::

      If the reorganization fails, `inplace` mode may corrupt the data in the input folder.

- **with-segmentation**: Indicates whether some subfolders contain segmentation files (default: True).
- **all-data-with-segmentation**: Specifies that all subfolders contain segmentation files (default: True).
- **log**: Specifies the path to a log file for recording detailed output information.
- **new_log_file**: Overwrites any existing log file with the same name.
- **skip**: Specifies a file containing a list of subfolders to exclude from reorganization.
- **include**: Specifies a file containing a list of subfolders to include in reorganization.
- **multiprocessing**: Specifies the number of CPU cores to use for parallel processing.

Example Usage
-------------

The following example demonstrates how to use the `REORGANIZE` module to restructure a study folder:

.. code-block:: bash

    REORGANIZE:
    {
        inputFolder: /path/to/DICOM_folder
        outputFolderSuffix: ready
        inplace: False
        log: /path/to/logs/reorganize.log
    }

In this example:

- The folder `/path/to/DICOM_folder` will be reorganized, and the output folder will be named `DICOM_folder_ready`.
- Data will be saved to a new folder (inplace is set to `False`).
- A log of the operation will be saved at `/path/to/logs/reorganize.log`.

"""


#The `reorganize_multiprocessing.py` script restructures folders containing DICOM files to match the directory layout expected by `dcm2nii.py`. This ensures that each patient’s data is organized consistently, allowing for streamlined processing in the image analysis pipeline.
#
#### Input Structure
#The input directory should have the following structure:
#
#    Folder
#     ├── Patient #1
#     │   ├── Sub-analysis #1
#     │   │   ├── #############.dcm
#     │   │   ├── #############.dcm
#     │   │   └── ...
#     │   ├── Sub-analysis #2
#     │   │   ├── #############.dcm
#     │   │   └── ...
#     │   └── ...
#     ├── Patient #2
#     │   └── ...
#     └── Patient #k
#
### Output Structure for `dcm2nii.py`
#After reorganization, the structure will look like this:
#
#    Folder
#     ├── Patient #1
#     │   ├── Sub-analysis #1
#     │   │   ├── DCM
#     │   │   │   ├── #############.dcm
#     │   │   │   └── ...
#     │   │   └── RTSTRUCT.dcm
#     │   ├── Sub-analysis #2
#     │   └── ...
#     ├── Patient #2
#     │   └── ...
#     └── Patient #k
# 
#
### Usage
#Run `reorganize_multiprocessing.py` with the following options:
#
#- **-i, --InPath**: Specify the input folder containing patient subfolders with DICOM files.
#- **-o, --OutPath**: Specify the output folder to save reorganized files. Required if `inplace` is `False`.
#- **--inplace**: If `True`, reorganizes files in the input folder directly without creating a new output folder.
#- **--log**: Specify the path to a log file to save output details.
#- **--new_log**: Overwrite an existing log file if one exists with the same name.
#- **-v, --verbose**: Enable verbose output for detailed processing information.
#- **--no-segmentation**: Specify if the input folder does not contain segmentation files (default: `False`).
#- **--all-segmentation**: Indicate that all subfolders contain segmentation files (default: `False`).
#- **-S, --skip**: Provide a file with a list of filenames to exclude from reorganization.
#- **--include**: Provide a file with a list of filenames to include in reorganization (all files included by default).
#- **-j, --n_jobs**: Set the number of CPU cores to use for parallel processing.
#
### Example Command
#
#To reorganize a folder with subfolders for each patient and save the result in a new directory:
#
#```bash
#python reorganize_multiprocessing.py -i /path/to/input_folder -o /path/to/output_folder --inplace=False --log /path/to/logfile.log
#Usage: reorganize_multiprocessing.py --InPath --OutPath --inplace=False
#Help: reorganize_multiprocessing.py  -h

import multiprocessing
import sys, getopt, os
import glob
from tqdm import tqdm
import shutil
import re
from datetime import datetime
from utils import hprint_msg_box
from utils import hprint

[docs] def main(argv): inpath = '' outpath = '' inplace = False verbose = False log = '' new_log = False n_jobs = 1 skip_file_name='' include_file_name='' skip_files=[] include_files=[] NoSegmentation = False AllSegmentation = False try: opts, args = getopt.getopt(argv, "hi:o:vj:S:",["log=","new_log","InPath=","OutPath=","verbose","inplace","help","n_jobs=","skip=","include=","no-segmentation","all-segmentation"]) except getopt.GetoptError: print('reorganize.py -i <inputfolder> -o <outputfolder> --inplace=False',flush=True) sys.exit(2) for opt,arg in opts: if opt in ("-h", "--help"): print("NAME",flush=True) print("\treorgnize\n",flush=True) print("SYNOPSIS",flush=True) print("\treorgnize.py [-h|--help][-v|--verbose][-i|--InPath <inputfolder>][-o|--OutPath <outfolder>][--inplace <bool>\n",flush=True) print("DESRIPTION",flush=True) print("\tReorganize folder with DICOM to work with structure expected for dcm2nii.py\n",flush=True) print("OPTIONS",flush=True) print("\t -h, --help: print this help page",flush=True) print("\t -v, --verbose: False by default",flush=True) print("\t -i, --InPath: input folder with Dicom images",flush=True) print("\t -o, --OutPath: output folder to save the organized files (if inplace=False)",flush=True) print("\t --inplace: if True use the input folder to regorganize patients' folders",flush=True) print("\t, --no-segmentation: input folder do not contains segmentations (default: False)",flush=True) print("\t, --all-segmentation: input folder contains segmentations for all data (default False)",flush=True) print("\t -S, --skip: path to file with filenames to skip",flush=True) print("\t --include: path to file with filenames to include (all files included by default)",flush=True) print("\t --log: stdout redirect to log file",flush=True) print("\t --new_log: overwrite previous log file") sys.exit() elif opt in ("-i", "--InPath"): inpath = arg elif opt in ("-o", "--OutPath"): outpath = arg elif opt in ("-v", "--verbose"): verbose = True elif opt in ("--inplace"): inplace = True elif opt in ("--log"): log= arg elif opt in ("--new_log"): new_log= True elif opt in ("-S","--skip"): skip_file_name= arg elif opt in ("--include"): include_file_name= arg elif opt in ("-j", "--n_jobs"): n_jobs= int(arg) elif opt in ("--no-segmentation"): NoSegmentation = True elif opt in ("--all-segmentation"): AllSegmentation = True if log != '': if new_log: f = open(log,'w+') else: f = open(log,'a+') sys.stdout = f if skip_file_name != '': try: file= open(skip_file_name, 'r') skip_files = file.read().splitlines() except: print("\033[31mERROR! Unable to read the skip file\033[0m",flush=True) if include_file_name != '': try: file= open(include_file_name, 'r') include_files = file.read().splitlines() except: print("\033[31mERROR! Unable to read the include file\033[0m",flush=True) if verbose: msg = ( f"Input path: {inpath}\n" f"Output path: {outpath}\n" f"Inplace: {inplace}\n" f"n_jobs: {n_jobs}\n" f"Skip file: {skip_file_name}\n" f"Files to skip: {skip_files}\n" f"Include file: {include_file_name}\n" f"Files to include: {include_files}\n" f"Log: {log}\n" f"Overwrite previous log file: {str(new_log)}\n" f"No segmentation: {NoSegmentation}\n" f"All data segmented: {AllSegmentation}\n" f"Verbose: {verbose}\n" ) hprint_msg_box(msg=msg, indent=2, title=f"REORGANIZE {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") if not inplace and (inpath == outpath): print("ERROR! : input and output path must be different if inplace is not set to True",flush=True) sys.exit() else: if inplace: outpath = inpath if n_jobs == 1: for patient in tqdm(glob.glob(inpath+"/*"), ncols=100, desc="Reoganize", bar_format="{l_bar}{bar} [time left: {remaining}, time spent: {elapsed}]", colour="yellow"): reorganize(patient,inpath, outpath,inplace,skip_files,include_files,verbose,log, NoSegmentation,AllSegmentation) else: with multiprocessing.Pool(n_jobs) as pool: tqdm(pool.starmap(reorganize, [(patient,inpath, outpath,inplace,skip_files,include_files,verbose,log, NoSegmentation,AllSegmentation) for patient in glob.glob(inpath+"/*")]), ncols=100, desc="Reorganize", bar_format="{l_bar}{bar} [time left: {remaining}, time spent: {elapsed}]", colour="yellow")
[docs] def reorganize(patient,inpath, outpath,inplace,skip_files,include_files,verbose,log, NoSegmentation,AllSegmentation): if log != '': f = open(log,'a+') sys.stdout = f patientID=os.path.basename(patient) if len(include_files) > 0: #if file to include are specify if patientID not in include_files: #if patient is to be excluded if verbose: print("\n"+patientID+" ("+patient+") is not in the list of patients to include",flush=True) return if len(skip_files) > 0: #if there are files to skip if patientID in skip_files: if verbose: print("\nskip "+patientID+" ("+patient+")",flush=True) return #make directory for patient if output folder is different from input folder if not os.path.exists(os.path.join(outpath,patientID)): os.makedirs(os.path.join(outpath,patientID)) if verbose: hprint(f"Processing {patientID}", patient) for patient_subdirectory in glob.glob(patient+"/*"): subdirectory=os.path.basename(patient_subdirectory) if os.path.isdir(patient_subdirectory): #sub directory is a folder not a file if verbose: hprint("Subdirectory", patient_subdirectory) #make directory for subfolder if output folder is different from input folder if not os.path.exists(os.path.join(outpath,patientID,subdirectory)): os.makedirs(os.path.join(outpath,patientID,subdirectory)) #make DCM subfolder if output folder is different from input folder if not os.path.exists(os.path.join(outpath,patientID,subdirectory,"DCM")): os.makedirs(os.path.join(outpath,patientID,subdirectory,"DCM")) for file in tqdm(glob.glob(patient_subdirectory+"/*"), position=0, leave=True): if inplace: try: shutil.move(file,os.path.join(inpath,patientID,subdirectory,"DCM")) except: if os.path.isfile(file): #don't print the warning for a folder print("\033[33mWARNING! the file "+file+" was not moved to DCM folder\033[0m",flush=True) else: try: shutil.copy(file,os.path.join(outpath,patientID,subdirectory,"DCM")) except: print("\033[31mWARNING! the file "+file+" was not copied\033[0m") if not NoSegmentation: #Process RTSTRUCT only if data are segmented rtstruct_list=[f for f in os.listdir(patient) if os.path.isfile(os.path.join(patient,f))] #select index that match the name of the current subdirectory rtstruct_list_idx=[i for i, item in enumerate (rtstruct_list) if re.search(subdirectory,item, re.IGNORECASE)] if len(rtstruct_list_idx) == 0: if AllSegmentation: #all data need to be segemented print("\033[31mERROR! : RTSTUCT not found for the current subdirectory\033[0m",flush=True) sys.exit() else: print("\033[31mWARNING! : No RTSTRUCT found for data "+file,"\033[0m",flush=True) elif len(rtstruct_list_idx) > 1: print("\033[31mERROR! : multiple RTSTRUCTs found for the current subdirectory\033[0m",flush=True) sys.exit() else: rtstruct_name=rtstruct_list[rtstruct_list_idx[0]] shutil.copy(os.path.join(inpath,patientID,rtstruct_name),os.path.join(outpath,patientID,subdirectory)) os.rename(os.path.join(outpath,patientID,subdirectory,rtstruct_name),os.path.join(outpath,patientID,subdirectory,"RTSTRUCT.dcm")) if inplace: os.remove(os.path.join(outpath,patientID,rtstruct_name))
if __name__ == "__main__": main(sys.argv[1:])