import pandas as pd
import os

def split_excel(input_file, output_dir, max_size_mb=130):
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created directory: {output_dir}")

    # Read the CSV file in chunks
    chunk_size = 10  # Define the chunk size
    print(f"Reading CSV in chunks of {chunk_size} rows.")

    reader = pd.read_csv(input_file, chunksize=chunk_size)
    current_part = 1
    current_data = []

    for chunk in reader:
        # Append the current chunk to the list
        current_data.append(chunk)
        print(f"Added a chunk. Number of chunks collected: {len(current_data)}")

        # Write the current data to a temporary Excel file to check the size
        temp_file_path = os.path.join(output_dir, f'temp_part.xlsx')
        pd.concat(current_data).to_excel(temp_file_path, index=False)

        # Check the size of the temporary file
        current_size = os.path.getsize(temp_file_path) / 1024**2  # Size in MB
        print(f"Current temporary file size: {current_size:.2f} MB")

        if current_size > max_size_mb:
            if len(current_data) > 1:  # Ensure there's more than one chunk
                current_data.pop()  # Remove the last added chunk
                final_file_path = os.path.join(output_dir, f'part_{current_part}.xlsx')
                pd.concat(current_data).to_excel(final_file_path, index=False)
                print(f'Saved {final_file_path} - Size: {current_size:.2f} MB')

                # Start a new part
                current_part += 1
                current_data = [chunk]  # Start next file with the current chunk
                print(f"Starting new part: {current_part}")
            else:
                # If only one chunk causes overflow, save it directly and start new part
                final_file_path = os.path.join(output_dir, f'part_{current_part}.xlsx')
                chunk.to_excel(final_file_path, index=False)
                print(f'Saved {final_file_path} - Size exceeds limit but saved as is.')
                current_part += 1
                current_data = []  # Reset current data

    # Save any remaining data in the last part
    if current_data:
        final_file_path = os.path.join(output_dir, f'part_{current_part}.xlsx')
        pd.concat(current_data).to_excel(final_file_path, index=False)
        current_size = os.path.getsize(final_file_path) / 1024**2  # Size in MB
        print(f'Saved {final_file_path} - Final Size: {current_size:.2f} MB')

    # Clean up the temporary file
    if os.path.exists(temp_file_path):
        os.remove(temp_file_path)
        print("Cleaned up temporary files.")

# Example usage
input_csv_path = r'C:\Users\leo\Downloads\cleaned_image.csv'
output_directory = r'C:\Users\leo\Downloads\split_excel'
split_excel(input_csv_path, output_directory)