Source code for radiomics_multiprocessing

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
.. _RADIOMICS:

The **RADIOMICS** module enables the extraction of radiomic features using the PyRadiomics library (https://pyradiomics.readthedocs.io/en/latest/).

Options
-------

The **RADIOMICS** module can be used with the following options:

- ``verbose``: Enable or disable verbose mode.
- ``timer``: Enable or disable the timer to record execution time.
- ``inputFolder``: Path to the input folder containing images.
- ``outputFolder``: Path to the output folder where radiomics results will be saved.
- ``outputFolderSuffix``: Adds a suffix to the input folder name to create an output folder.
- ``log``: Path to a file for saving logs.
- ``new_log_file``: Create a new log file; if a file with the same name already exists, it will be overwritten.
- ``skip``: Path to a file listing subfolders inside the input folder to exclude from processing.
- ``multiprocessing``: Specify the number of cores to use for parallel processing.
- ``configs``: Path to a radiomics configuration file, which can contain multiple configurations for PyRadiomics. See details in the next section :ref:`Radiomics_configuration_file`.
- ``pyradiomics_config``: Specify a PyRadiomics configuration file. Use this instead of ``configs`` if multiple configurations are unnecessary or if using PyRadiomics-specific preprocessing options.
- ``image_filename``: Name of the image file used for radiomic analysis.
- ``mask_filename``: Name of the mask (segmentation) file used for radiomic analysis.
- ``radiomics_filename``: Name of the Excel file that will store radiomics results.
- ``stats_filename``: Name of an optional Excel file to store statistics on radiomic features. If not specified, this file will not be created.
- ``save_at_the_end``: Specify whether the Excel file should be created only after processing all patients. Disabled by default, so new lines are added to the Excel file after processing each patient.

Example Usage
-------------

Below is an example of how to use the **RADIOMICS** module:

.. code-block:: bash

    RADIOMICS:
    {
        inputFolder: /path/to/NIFTI_folder
        outputFolder: /path/to/radiomics_results
        image_filename: img_111.nii.gz
        mask_filename: msk_111.nii.gz
        radiomics_filename: radiomics.xlsx
        save_at_the_end: false
        configs: /path/to/radiomics_config_file
        log: /path/to/logs/radiomics.log
    }

In this example:

- **inputFolder**: Specifies the folder containing the NIfTI images for radiomics analysis.
- **outputFolder**: Directory where radiomics results are stored.
- **image_filename**: Indicates the name of the image to be analyzed.
- **mask_filename**: Name of the mask file associated with the image.
- **radiomics_filename**: Designates the Excel file that will hold radiomics features extracted by PyRadiomics.
- **save_at_the_end**: If set to `false`, entries are written to the Excel file immediately after processing each patient.
- **configs**: Specifies the configuration file for PyRadiomics, allowing customization of feature extraction.
- **log**: Provides the path to the log file for recording the processing details.
"""

# Extract Radiomics features
#
# This script uses the PyRadiomics library to extract radiomics features using one or multiple configurations.
# The configurations are specified in a config file (see documentation for more details).
# Gabor decomposition is performed by this script and not by PyRadiomics.
# Usage:
#     radiomics_multiprocessing.py -i <inputfolder> -o <outputfolder> [options]
#
#__author__ Florent Tixier
#__email__ tixier@jhu.edu
#
# Options:
#   -h, --help                        Show this help message and exit
#   -v, --verbose                     Enable verbose output (default: False)
#   -i, --inputFolder <inputfolder>   Input folder with images to analyze
#   -o, --outputFolder <outputfolder> Output folder to save the results (default: ~/)
#   -c, --config <configfile>         File with a list of radiomics configurations for PyRadiomics (see CONFIGS_EXAMPLE)
#   -p, --pyradiomics_config <file>   A PyRadiomics configuration file (use instead of --config if only one configuration is needed)
#   -I, --img_filename <filename>     Name of images to analyze in the folder (default: img.nii.gz)
#   -M, --msk_filename <filename>     Name of masks to analyze in the folder (default: msk.nii.gz)
#   -R, --radiomics_filename <filename> Name of the Excel file to save radiomics features (default: radiomics.xlsx)
#       --stats_filename <filename>   Name of the Excel file to save radiomics statistics (optional)
#   -x                                Save Excel file with radiomics after processing all patients
#   -S, --skip <skip file path>       Path to file with filenames to skip
#       --include <include file path> Path to file with filenames to include (default: include all)
#       --log <log file path>         Redirect stdout to a log file
#       --new_log                     Overwrite previous log file
#   -j, --n_jobs <number of jobs>     Number of simultaneous jobs (default: 1)
#
# Help:
#     radiomics_multiprocessing.py -h

import sys, getopt, os
from tqdm import tqdm
import glob
import logging
import pandas as pd
import math
from math import pi
import SimpleITK as sitk
import multiprocessing
from radiomics import featureextractor
import re
from datetime import datetime
from utils import eprint
from utils import hprint_msg_box
from utils import hprint
from utils import format_list_multiline

def _print_help():
    """Print the command-line help page."""
    print("NAME")
    print("\tradiomics_multiprocessing.py\n")
    print("SYNOPSIS")
    print("\tradiomics_multiprocessing.py [-h|--help][-v|--verbose][-i|--inputFolder <inputfolder>] [-I|--img_filename <img_filename>] [-M|--msk_filename <msk_filename>] [-R|--radiomics_filename <radiomics_filename>] [--stats_filename <stats_filename>] [-o|--outputFolder <outputFolder>] -c <configfile> [-x] [-S|--skip <skip>] [--include <include>] [--log <log>] [-j|--n_jobs <n_jobs>]")
    print("DESCRIPTION")
    print("\tExtract Radiomics features for patients in the input folder with configurations in the config file\n")
    print("OPTIONS")
    print("\t -h, --help: print this help page")
    print("\t -v, --verbose: False by default")
    print("\t -i, --inputFolder: input folder with images to analyze")
    print("\t -I, --img_filename: name of images to analyze in the folder (default: img.nii.gz)")
    print("\t -M, --msk_filename: name of masks to analyze in the folder (default: msk.nii.gz)")
    print("\t -R, --radiomics_filename: Name of the Excel file to save radiomics features (default: radiomics.xlsx)")
    print("\t --stats_filename: Name of the Excel file to save radiomics statistics (optional)")
    print("\t -o, --outputFolder (alias --outFolder): Output folder to save the results (default: ~/)")
    print("\t -c, --config: File with a list of radiomics configurations for pyradiomics (see CONFIGS_EXAMPLE)")
    print("\t -p, --pyradiomics_config: A pyradiomics configuration file (to use instead of --config if there is no need of multiple configurations or to use preprocessing options of pyradiomics)")
    print("\t -x: Save Excel file with radiomics after processing all the patients")
    print("\t -S, --skip: Path to file with filenames to skip")
    print("\t --include: Path to file with filenames to include (all files included by default)", flush=True)
    print("\t --log: redirect stdout to a log file")
    print("\t --new_log: overwrite previous log file")
    print("\t -j, --n_jobs: Number of simultaneous jobs (default:1)")


def main(argv):
    """Parse the command line and extract radiomics for every patient folder.

    :param argv: command-line arguments without the program name
        (typically ``sys.argv[1:]``).

    The function exits through ``sys.exit`` when help is requested, when the
    options are invalid (exit status 2), when the configuration file cannot be
    read, or when neither ``--config`` nor ``--pyradiomics_config`` is given.

    Each entry of ``<inputFolder>/*`` is handed to ``extract_radiomics``,
    either sequentially or through a ``multiprocessing.Pool`` when
    ``--n_jobs`` is greater than 1.
    """
    inpath = ''
    outpath = '~/'
    configFile = ''
    configs = []  # list of configs for the radiomics feature extraction
    pyrconfigFile = ''
    img_filename = "img.nii.gz"
    msk_filename = "msk.nii.gz"
    radiomics_filename = 'radiomics.xlsx'
    verbose = False
    n_jobs = 1
    skip_file_name = ''
    skip_files = []
    include_file_name = ''
    include_files = []
    save_xlsx_at_the_end = False
    features_df = pd.DataFrame()  # dataframe for radiomics features
    log = ''
    new_log = False
    stats_filename = ''

    try:
        # "outFolder=" is the historical spelling; "outputFolder=" is the one
        # documented in the module header. Both are accepted.
        opts, args = getopt.getopt(
            argv,
            "vhi:o:c:p:j:I:M:R:S:x",
            ["log=", "new_log", "verbose", "skip=", "include=", "help",
             "config=", "pyradiomics_config=", "inputFolder=", "outFolder=",
             "outputFolder=", "n_jobs=", "img_filename=", "msk_filename=",
             "radiomics_filename=", "stats_filename="],
        )
    except getopt.GetoptError:
        print('Usage: radiomics_multiprocessing.py -i <inputfolder> -o <outputfolder> -c <configfile>')
        print('For help, use: radiomics_multiprocessing.py -h')
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            _print_help()
            sys.exit()
        elif opt in ("-i", "--inputFolder"):
            inpath = arg
        elif opt in ("-o", "--outputFolder", "--outFolder"):
            outpath = arg
        elif opt in ("-I", "--img_filename"):
            img_filename = arg
        elif opt in ("-M", "--msk_filename"):
            msk_filename = arg
        elif opt in ("-R", "--radiomics_filename"):
            radiomics_filename = arg
        elif opt == "--stats_filename":
            stats_filename = arg
        elif opt in ("-v", "--verbose"):
            verbose = True
        elif opt == "-x":
            save_xlsx_at_the_end = True
        elif opt in ("-j", "--n_jobs"):
            n_jobs = int(arg)
        elif opt in ("-c", "--config"):
            configFile = arg
        elif opt in ("-p", "--pyradiomics_config"):
            pyrconfigFile = arg
        elif opt in ("-S", "--skip"):
            skip_file_name = arg
        elif opt == "--include":
            include_file_name = arg
        elif opt == "--log":
            log = arg
        elif opt == "--new_log":
            new_log = True

    # Expand "~" so the documented default really points at the home folder
    # instead of creating a directory literally named "~".
    outpath = os.path.expanduser(outpath)

    # set level for all classes
    logger = logging.getLogger("radiomics")
    logger.setLevel(logging.ERROR)

    if log != '':
        # stdout stays redirected to the log file for the rest of the run,
        # so the handle is intentionally not closed here.
        sys.stdout = open(log, 'w+' if new_log else 'a+')

    if skip_file_name != '':
        try:
            with open(skip_file_name, 'r') as file:
                skip_files = file.read().splitlines()
        except (OSError, UnicodeDecodeError):
            print("ERROR! Unable to read the skip file")

    if include_file_name != '':
        try:
            with open(include_file_name, 'r') as file:
                include_files = file.read().splitlines()
        except (OSError, UnicodeDecodeError):
            print("ERROR! Unable to read the include file", flush=True)

    if verbose:
        msg = (
            f"Input folder: {inpath}\n"
            f"Output folder: {outpath}\n"
            f"Images name: {img_filename}\n"
            f"Masks name: {msk_filename}\n"
            f"Configuration file: {configFile}\n"
            f"Pyradiomics Configuration file: {pyrconfigFile}\n"
            f"Radiomics Excel file name: {radiomics_filename}\n"
            f"Save Excel file at the end: {save_xlsx_at_the_end}\n"
        )
        if stats_filename != '':
            msg += f"Save an additional Excel file with radiomics statistics in: {stats_filename}\n"
        msg += (
            f"Skip file: {skip_file_name}\n"
            f"Files to skip: {format_list_multiline(skip_files,5)}\n"
            f"Include file: {include_file_name}\n"
            f"Files to include: {format_list_multiline(include_files,5)}\n"
            f"Verbose: {verbose}\n"
            f"n_jobs: {n_jobs}\n"
            f"Log: {log}\n"
            f"Overwrite previous log file: {str(new_log)}\n"
        )
        hprint_msg_box(msg=msg, indent=2, title=f"RADIOMICS {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # create outpath directory if needed
    os.makedirs(outpath, exist_ok=True)

    if configFile != '':
        try:
            read_config_file(configFile, configs, verbose)
        except Exception:
            print("\033[31mERROR! Invalid config file\033[0m", flush=True)
            sys.exit()
        if verbose:
            print("Config file", configFile, " was read with success", flush=True)
    elif pyrconfigFile == '':
        print("A RADIOMICS_CONFIGS or a pyradiomics configuration file need to be specify", flush=True)
        sys.exit()

    patients = sorted(glob.glob(inpath + "/*"))
    bar_options = dict(
        ncols=100,
        desc="Extract Radiomics",
        bar_format="{l_bar}{bar} [time left: {remaining}, time spent: {elapsed}]",
        colour="yellow",
    )
    # Arguments shared by every call. The `features_df` slot is kept for
    # signature compatibility only; extract_radiomics does not read it.
    shared_args = (inpath, outpath, img_filename, msk_filename, configs,
                   pyrconfigFile, features_df, radiomics_filename,
                   save_xlsx_at_the_end, n_jobs, skip_files, include_files,
                   verbose, log)

    if n_jobs == 1:
        for patient in tqdm(patients, **bar_options):
            # extract_radiomics returns None for skipped patients;
            # pd.concat silently drops None entries.
            features_df = pd.concat([features_df, extract_radiomics(patient, *shared_args)], axis=0)
    else:
        with multiprocessing.Pool(n_jobs) as pool:
            pending = [pool.apply_async(extract_radiomics, (patient, *shared_args))
                       for patient in patients]
            # Iterating the pending results drives the progress bar and
            # re-raises any exception that occurred in a worker.
            results = [task.get() for task in tqdm(pending, **bar_options)]
        if save_xlsx_at_the_end:
            frames = [frame for frame in results if frame is not None]
            if frames:
                features_df = pd.concat(frames, axis=0)
        elif merge_xlsx(outpath, radiomics_filename) == 1:
            # merge_xlsx returns -1 (truthy) on failure: only delete the
            # temporary files after a confirmed successful merge.
            deleteTmp_xlsx(outpath)

    if save_xlsx_at_the_end:
        features_df.to_excel(os.path.join(outpath, radiomics_filename), index=False)

    if stats_filename != '':
        radiomics_statistics(os.path.join(outpath, radiomics_filename), os.path.join(outpath, stats_filename), verbose, log)
# Feature classes enabled for every configuration whose image type is not 'Original'.
_RADIOMICS_FEATURE_CLASSES = ('firstorder', 'glcm', 'gldm', 'glrlm', 'glszm', 'ngtdm')


def _enable_radiomics_feature_classes(extractor):
    """Disable all features on *extractor*, then enable the classes listed above."""
    extractor.disableAllFeatures()
    for feature_class in _RADIOMICS_FEATURE_CLASSES:
        extractor.enableFeatureClassByName(feature_class)


def _load_image_and_mask(patient_subdirectory, img_filename, msk_filename, patientID, subdirectory):
    """Read the image and the mask of one sub-folder.

    Returns ``(img, msk)`` or ``None`` when the image or mask cannot be read
    or when the mask contains only zeros (the reason is reported).
    """
    img_path = os.path.join(patient_subdirectory, img_filename)
    msk_path = os.path.join(patient_subdirectory, msk_filename)
    try:
        img = sitk.ReadImage(img_path)
    except Exception:
        print("\033[31mERROR! Image ", img_path, " was not read\033[0m", flush=True)
        print("\033[31mSkipping image" + patientID + " " + subdirectory + "\033[0m", flush=True)
        eprint("Skipping " + patientID + " " + subdirectory + " (ERROR reading image)")
        return None
    try:
        msk = sitk.ReadImage(msk_path)
        mask_is_empty = not sitk.GetArrayFromImage(msk).any()
    except Exception:
        print("\033[31mERROR! Mask ", msk_path, " was not read\033[0m", flush=True)
        print("\033[31mSkipping image" + patientID + " " + subdirectory + "\033[0m", flush=True)
        eprint("Skipping " + patientID + " " + subdirectory + " (ERROR reading mask)")
        return None
    if mask_is_empty:
        print(f"\033[31mERROR! Mask {msk_path} is empty (all voxels = 0)\033[0m", flush=True)
        print(f"\033[31mSkipping image {patientID} {subdirectory}\033[0m", flush=True)
        eprint(f"Skipping {patientID} {subdirectory} (ERROR empty mask)")
        return None
    return img, msk


def _execute_config(img, msk, params, patientID, subdirectory, outpath):
    """Run PyRadiomics for one parsed configuration and return its result.

    ``params['imageType']`` selects the strategy: 'Original' uses the
    extractor as configured, 'Gabor' filters the cropped image with
    ``gaborFilterImg`` before extraction, and any other value is enabled
    as a PyRadiomics image type. Exceptions propagate to the caller.
    """
    image_type = params['imageType']
    if image_type == 'Original':
        extractor = featureextractor.RadiomicsFeatureExtractor(**params)
        return extractor.execute(img, msk)

    if image_type == 'Gabor':
        params['imageType'] = 'Original'  # For gabor the convolution is done outside pyradiomics
        extractor = featureextractor.RadiomicsFeatureExtractor(**params)
        _enable_radiomics_feature_classes(extractor)
        boundingBox = featureextractor.imageoperations.checkMask(sitk.Cast(img, sitk.sitkInt64), sitk.Cast(msk, sitk.sitkInt64))
        if 'padDistance' not in params:
            params['padDistance'] = 10
        cropImg, cropMsk = featureextractor.imageoperations.cropToTumorMask(img, msk, boundingBox[0], padDistance=params['padDistance'])
        gabCropImg = gaborFilterImg(cropImg, params=params, ID=patientID + '_' + subdirectory, path=outpath)
        return extractor.execute(gabCropImg, cropMsk)

    extractor = featureextractor.RadiomicsFeatureExtractor(**params)
    extractor.disableAllImageTypes()
    extractor.enableImageTypeByName(image_type)
    _enable_radiomics_feature_classes(extractor)
    return extractor.execute(img, msk)


def extract_radiomics(patient,inpath,outpath,img_filename,msk_filename,configs,pyrconfigFile,features_df,radiomics_filename,save_xlsx_at_the_end,n_jobs,skip_files,include_files,verbose,log):
    """Extract radiomics features for every sub-folder of one patient folder.

    :param patient: path of the patient folder; its basename is the patient ID.
    :param inpath: input folder (not read by this function).
    :param outpath: folder receiving the Excel output (and Gabor images).
    :param img_filename: image file name looked up in each sub-folder.
    :param msk_filename: mask file name looked up in each sub-folder.
    :param configs: configurations produced by ``read_config_file``.
    :param pyrconfigFile: PyRadiomics parameter file; when not empty it is
        used instead of ``configs``.
    :param features_df: not read by this function (kept for compatibility).
    :param radiomics_filename: name of the Excel file with the results.
    :param save_xlsx_at_the_end: when false, each sub-folder row is written
        immediately (appended to the final file when ``n_jobs == 1``, written
        to a ``.tmp___`` file otherwise); when true, rows are only returned.
    :param n_jobs: number of parallel jobs selected by the caller.
    :param skip_files: patient IDs to skip.
    :param include_files: when not empty, only these patient IDs are processed.
    :param verbose: print progress information.
    :param log: path of a log file receiving stdout ('' to disable).
    :return: ``None`` when the patient is skipped, otherwise a DataFrame with
        the accumulated rows (empty unless ``save_xlsx_at_the_end`` is true).
    """
    # Redirect stdout only when it is not already pointing at the log file,
    # otherwise a new file handle would be opened for every patient.
    if log != '' and getattr(sys.stdout, 'name', None) != log:
        sys.stdout = open(log, 'a+')

    patientID = os.path.basename(patient)
    patient_features_df = pd.DataFrame()  # radiomics features for 1 patient

    if len(include_files) > 0 and patientID not in include_files:
        if verbose:
            print("\n" + patientID + " (" + patient + ") is not in the list of patients to include", flush=True)
        return
    if len(skip_files) > 0 and patientID in skip_files:
        if verbose:
            print("\nskip " + patientID + " (" + patient + ")", flush=True)
        return

    if verbose:
        hprint(f"Processing {patientID}", patient)

    for patient_subdirectory in glob.glob(patient + "/*"):
        subdirectory = os.path.basename(patient_subdirectory)
        features_subdir_df = pd.DataFrame()  # radiomics features for 1 sub-folder
        if verbose:
            print(patientID + ": " + subdirectory, flush=True)

        loaded = _load_image_and_mask(patient_subdirectory, img_filename, msk_filename, patientID, subdirectory)
        if loaded is None:
            continue
        img, msk = loaded

        if pyrconfigFile != '':  # use a pyradiomics config file
            if verbose:
                print(patientID + ": " + subdirectory + " (" + pyrconfigFile + ")", flush=True)
            # Reset the result so a failure can never reuse the values
            # computed for a previous sub-folder.
            radiomics = None
            try:
                extractor = featureextractor.RadiomicsFeatureExtractor(pyrconfigFile)
                radiomics = extractor.execute(img, msk)
            except Exception as exc:
                print(f'\033[31mERROR radiomics feature extraction failed: {exc}\033[0m', flush=True)
            if radiomics is not None:
                try:
                    features = dict(radiomics.items())
                    features_subdir_cfg_df = pd.DataFrame([features.values()], columns=features.keys())
                    features_subdir_cfg_df.insert(loc=0, column='sub_Analysis', value=subdirectory)
                    features_subdir_cfg_df.insert(loc=0, column='patientID', value=patientID)
                    features_subdir_df = pd.concat([features_subdir_df, features_subdir_cfg_df], axis=1)
                except Exception:
                    print("\033[31mERROR reading radiomics items\033[0m", flush=True)
        else:  # use RADIOMICS_CONFIGS file
            first_cfg = True  # diagnostics come from the first successful configuration
            for cfg in configs:
                if verbose:
                    print(patientID + ": " + subdirectory + " (" + cfg["configName"] + ")", flush=True)
                params = {key: parse(cfg[key]) for key in cfg.index}
                if verbose:
                    print(params, flush=True)

                try:
                    radiomics = _execute_config(img, msk, params, patientID, subdirectory, outpath)
                except Exception as exc:
                    print(f'\033[31mERROR radiomics feature extraction failed: {exc}\033[0m', flush=True)
                    # Skip this configuration: there is no result to read, and
                    # the previous configuration's values must not be reused.
                    continue

                if first_cfg:  # pull out diagnostic info from first configuration
                    first_cfg = False
                    try:
                        diagnostics = {k: v for k, v in radiomics.items() if k.startswith('diagnostics')}
                        diagnostics_df = pd.DataFrame([diagnostics.values()], columns=diagnostics.keys())
                        diagnostics_df.insert(loc=0, column='sub_Analysis', value=subdirectory)
                        diagnostics_df.insert(loc=0, column='patientID', value=patientID)
                        features_subdir_df = pd.concat([features_subdir_df, diagnostics_df], axis=1)
                    except Exception:
                        print('\033[31mERROR reading diagnostic information\033[0m', flush=True)

                try:
                    features = {k: v for k, v in radiomics.items() if not k.startswith('diagnostics')}
                    # Leading token of the first feature name (text before the first '_' or '-').
                    prefix = next(iter(features.keys())).split('_')[0].split('-')[0]
                    features = {k.removeprefix(prefix): v for k, v in features.items()}  # remove original from feature name
                    features = {f'{params["configName"]}{k}': v for k, v in features.items()}  # add configName as a prefix to the features name
                    features_subdir_cfg_df = pd.DataFrame([features.values()], columns=features.keys())
                    features_subdir_df = pd.concat([features_subdir_df, features_subdir_cfg_df], axis=1)
                except Exception:
                    print("\033[31mERROR reading radiomics items\033[0m", flush=True)

        if features_subdir_df.empty:
            # Nothing was extracted: writing an empty row would corrupt the
            # workbook layout (missing header / blank temporary file).
            eprint("Skipping " + patientID + " " + subdirectory + " (ERROR no radiomics extracted)")
            continue

        if not save_xlsx_at_the_end:
            xlsx_path = os.path.join(outpath, radiomics_filename)
            if n_jobs == 1:  # no multiprocessing
                try:
                    if not os.path.exists(xlsx_path):
                        features_subdir_df.to_excel(xlsx_path, index=False)
                    else:
                        with pd.ExcelWriter(xlsx_path, if_sheet_exists='overlay', mode='a') as writer:
                            features_subdir_df.to_excel(writer, startrow=writer.sheets['Sheet1'].max_row, index=False, header=False)
                except Exception:
                    print("\033[31mERROR! patient " + patientID + " was not added in the excel file\033[0m", flush=True)
            else:
                # One temporary workbook per sub-folder; merge_xlsx combines them later.
                tmp_path = os.path.join(outpath, ".tmp___" + patientID + "___" + subdirectory + "___" + radiomics_filename)
                try:
                    features_subdir_df.to_excel(tmp_path, index=False)
                except Exception:
                    print("\033[31mERROR! patient " + patientID + " was not added in the excel file (" + tmp_path + ")\033[0m", flush=True)
        else:
            patient_features_df = pd.concat([patient_features_df, features_subdir_df], axis=0)  # add subanalysis for the patient to feature_df

    hprint("Radiomics saved:", os.path.join(outpath, radiomics_filename))
    return patient_features_df
#Take a string and try to parse it to a list, a float, an int or a bool
def parse(i):
    """Convert a configuration value (text) to a Python object.

    Conversions are tried in this order:

    1. ``'True'``/``'true'`` and ``'False'``/``'false'`` become booleans;
    2. ``int(i)``, then ``float(i)``;
    3. text containing ``'pi'`` is evaluated as a Python expression
       (e.g. ``'pi/4'``); if evaluation fails the text is returned unchanged;
    4. text starting with ``'['`` is split on commas and each item is parsed
       recursively, giving a list;
    5. anything else is returned unchanged.

    :param i: value to convert (normally a ``str``).
    :return: the converted value, or ``i`` itself when no conversion applies
        (including the empty string and list text with an empty item).
    """
    if i in ('True', 'true'):  # Bool True
        return True
    if i in ('False', 'false'):  # Bool False
        return False

    for convert in (int, float):
        try:
            return convert(i)
        except (TypeError, ValueError):
            continue

    # Nothing more can be done with non-text or empty values
    # (an empty string used to raise IndexError on i[0]).
    if not isinstance(i, str) or not i:
        return i

    if 'pi' in i:
        # NOTE(security): eval() executes arbitrary Python. The value comes
        # from the radiomics configuration file, so that file must be trusted.
        try:
            return eval(i)
        except Exception:
            return i

    if i[0] != '[':  # string
        return i

    # list
    items = i.split(',')
    items[0] = items[0].strip('[')
    items[-1] = items[-1].strip(']')
    if not all(items):
        # An empty item (e.g. "[]" or "[1,,2]") is not a valid list:
        # keep the raw text.
        return i
    return [parse(item) for item in items]
def read_config_file(config_File,configs,verbose):
    """Read a radiomics configuration file and append its configurations to *configs*.

    The file is made of blocks of ``key: value`` lines. A blank line ends a
    block and lines starting with ``#`` are ignored. Spaces and tabs are
    removed from keys and values. A block is kept only when it defines both
    ``configName`` and ``imageType``.

    :param config_File: path of the configuration file.
    :param configs: list extended in place with one ``pandas.Series`` per
        configuration (index = keys, values = raw text).
    :param verbose: print each configuration that was found.
    :raises ValueError: when a non-comment line contains no ``:`` separator.
    :raises OSError: when the file cannot be opened.
    """
    def store_block(entries):
        # a config needs to contain at least a config name and an image type
        if 'configName' in entries and 'imageType' in entries:
            config = pd.Series(entries, dtype=object)
            configs.append(config)
            if verbose:
                print("\033[1mThe following configuration was found in", config_File, "\033[0m", flush=True)
                print(config, flush=True)

    entries = {}
    with open(config_File, 'r') as infile:
        for line_number, raw_line in enumerate(infile, start=1):
            line = raw_line.strip()
            if not line:
                store_block(entries)
                entries = {}
                continue
            if line[0] == '#':
                continue
            # Split on the first ':' only, so values may themselves contain
            # ':'. str.strip('configName:') must not be used here: it strips
            # a *set of characters* and would eat the end of values such as
            # 'Square'.
            key, separator, value = line.partition(':')
            if not separator:
                raise ValueError(
                    f"{config_File}, line {line_number}: expected 'key: value', got {line!r}"
                )
            key = key.replace(' ', '').replace('\t', '')
            value = value.replace(' ', '').replace('\t', '')
            entries[key] = value
    # The last block is not necessarily followed by a blank line.
    store_block(entries)
#Function for Gabor filtering
#This function should be used after cropping the image to reduce processing time
def gaborFilterImg(img,params,ID,path="~/"):
    """Convolve every slice of *img* (along the third axis) with a 2D Gabor kernel.

    :param img: SimpleITK image to filter; it is cast to float32 first.
    :param params: dictionary of options, completed in place with defaults
        for ``verbose`` (False), ``save`` (False), ``padDistance`` (10),
        ``size`` (8), ``freq`` (0.5) and ``angle`` (0).
    :param ID: name of the sub-folder of *path* used when ``save`` is set.
    :param path: folder in which the kernel and the filtered image are
        written when ``save`` is set.
    :return: the filtered image.
    """
    # Fill in the defaults for missing parameters.
    for option, fallback in (('verbose', False), ('save', False), ('padDistance', 10),
                             ('size', 8), ('freq', 0.5), ('angle', 0)):
        params.setdefault(option, fallback)

    size = params['size']
    angle = params['angle']
    save_dir = os.path.join(path, ID)
    hsize = size * 0.5

    vecDirection = (math.cos(angle), -math.sin(angle), math.sin(angle), math.cos(angle))
    vecOrigin = (hsize - hsize * (math.cos(angle) - math.sin(angle)),
                 hsize - hsize * (math.sin(angle) + math.cos(angle)))

    # Build the Gabor kernel image.
    kernel_source = sitk.GaborImageSource()
    kernel_source.SetOutputPixelType(sitk.sitkFloat32)
    kernel_source.SetDirection(vecDirection)
    kernel_source.SetOrigin(vecOrigin)
    kernel_source.SetSize([size] * 2)
    kernel_source.SetSigma([size * .2] * 2)
    kernel_source.SetMean([hsize] * 2)
    kernel_source.SetFrequency(params['freq'])
    kernel_img = kernel_source.Execute()

    if params['save']:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        sitk.WriteImage(kernel_img, os.path.join(save_dir, "GaborKernel.nii.gz"))

    convolution = sitk.ConvolutionImageFilter()
    img = sitk.Cast(img, sitk.sitkFloat32)
    filtered = img  # same object as img: slices are replaced one by one

    slice_indices = range(img.GetSize()[2])
    if params['verbose']:
        # Same loop, with a progress bar.
        slice_indices = tqdm(slice_indices, ncols = 100, desc="Apply gabor filter", bar_format="{l_bar}{bar} [ time left: {remaining}, time spent: {elapsed}]")
    for k in slice_indices:
        try:
            filtered[:, :, k] = convolution.Execute(img[:, :, k], kernel_img)
        except Exception as e:
            print(e, flush=True)

    if params['save']:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        sitk.WriteImage(filtered, os.path.join(save_dir, "Gabor_img.nii.gz"))
    return filtered
#Merge the temporary Excel files into one file (the temporary files are deleted separately by deleteTmp_xlsx)
def merge_xlsx(path,radiomics_filename):
    """Merge the temporary ``.tmp___*.xlsx`` files of *path* into one workbook.

    The header row of the first temporary file (in sorted name order) is kept
    and the header rows of the others are dropped. Rows are combined by column
    position.

    :param path: folder containing the temporary Excel files.
    :param radiomics_filename: name of the merged Excel file written in *path*.
    :return: ``1`` on success, ``-1`` on failure (including when there is no
        temporary file). Note that ``-1`` is truthy: callers must compare the
        result with ``1`` rather than test its truth value.
    """
    # Sorted so that the row order of the merged file is reproducible.
    file_list = sorted(glob.glob(path + "/.tmp___*.xlsx"))
    try:
        # header=None keeps the header as an ordinary first row, so files can
        # be stacked by position; read_excel closes each file after reading.
        frames = [pd.read_excel(name, sheet_name=0, header=None, index_col=None) for name in file_list]
        # remove headers row except for the first xlsx
        frames[1:] = [df[1:] for df in frames[1:]]
        combined = pd.concat(frames)
    except Exception:
        print("\033[31mERROR! Excel files were not read correctly\033[0m", flush=True)
        return -1

    try:
        combined.to_excel(os.path.join(path, radiomics_filename), header=False, index=False)
    except Exception:
        print("\033[31mERROR! Excel files were not merged\033[0m", flush=True)
        return -1
    print("Excel files were merged with success", flush=True)
    return 1
#Save radiomics statistics
def _statistics_rows(df, radiomics_columns, label):
    """Return the ``describe()`` statistics of *df* laid out like the radiomics sheet.

    The result has the columns of *df* with ``patientID`` renamed to
    ``statistics`` (holding the statistic names) and ``sub_Analysis`` set to
    *label*. Columns without statistics are left empty.
    """
    stats = df[radiomics_columns].describe()
    rows = pd.DataFrame(columns=df.columns)
    rows = rows.rename(columns={'patientID': 'statistics'})
    rows['statistics'] = stats.index.tolist()
    rows['sub_Analysis'] = label
    rows.index = stats.index
    # describe() may return fewer columns than requested (non-numeric columns
    # are dropped), so assign only the columns that were actually described.
    rows[list(stats.columns)] = stats
    return rows


def radiomics_statistics(xlsx_input_file, xlsx_output_file, verbose, log):
    """Write descriptive statistics of the radiomic features to an Excel file.

    Statistics are computed with ``DataFrame.describe()`` for the whole table
    (``sub_Analysis`` = ``'all'``) and then for each distinct ``sub_Analysis``
    value. The ``patientID``, ``sub_Analysis`` and ``diagnostics*`` columns
    are excluded from the statistics.

    :param xlsx_input_file: Excel file with the radiomics features.
    :param xlsx_output_file: Excel file to create; when it already exists a
        timestamp is inserted before the extension instead of overwriting it.
    :param verbose: print progress and warning messages.
    :param log: path of a log file receiving stdout ('' to disable).

    Errors are reported on stdout; no exception is propagated.
    """
    # Redirect stdout only when it is not already pointing at the log file.
    if log != '' and getattr(sys.stdout, 'name', None) != log:
        sys.stdout = open(log, 'a+')

    try:
        df = pd.read_excel(xlsx_input_file)

        # Define columns to exclude
        exclude_pattern = r'^(patientID|sub_Analysis)|diagnostics'
        radiomics_columns = [col for col in df.columns if not re.match(exclude_pattern, col)]

        # Statistics for the entire dataset come first
        blocks = [_statistics_rows(df, radiomics_columns, 'all')]

        # Then one block per sub_Analysis value
        for value in df['sub_Analysis'].unique():
            if verbose:
                print(f"Calculating statistics for sub_Analysis = {value}", flush=True)
            subset_df = df[df['sub_Analysis'] == value]
            blocks.append(_statistics_rows(subset_df, radiomics_columns, value))

        statistic_df = pd.concat(blocks, ignore_index=True)

        # Check if output file exists, and create a timestamped version if it does
        if os.path.exists(xlsx_output_file):
            timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
            base_filename = os.path.basename(xlsx_output_file)
            filename, extension = os.path.splitext(base_filename)
            new_filename = f"{filename}_{timestamp}{extension}"
            if verbose:
                print("\033[33mWARNING!", xlsx_output_file, "already exists, results will be saved in", new_filename, "\033[0m", flush=True)
            xlsx_output_file = os.path.join(os.path.dirname(xlsx_output_file), new_filename)

        # Save the final statistics dataframe to Excel
        try:
            statistic_df.to_excel(xlsx_output_file, index=False)
            hprint("Radiomics statistics saved", xlsx_output_file)
        except Exception:
            print("\033[31mERROR saving ", xlsx_output_file, "\033[0m", flush=True)

    except Exception as e:
        print("\033[31mERROR reading ", xlsx_input_file, ": ", str(e), "\033[0m", flush=True)
def deleteTmp_xlsx(path):
    """Remove every temporary ``.tmp___*.xlsx`` file found directly in *path*.

    :param path: folder containing the temporary Excel files.
    """
    tmp_pattern = path + "/.tmp___*.xlsx"
    for tmp_file in glob.glob(tmp_pattern):
        os.remove(tmp_file)
# Script entry point: pass the command-line arguments (without the program name) to main().
if __name__ == "__main__": main(sys.argv[1:])