MathEval/generate_ocr_custom.py at main · eth-lre/MathEval · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""
Script to generate OCR custom questions for math-eval dataset.

Usage:
    python generate_ocr_custom.py --equations_file <equations_file> --output_dir <output_dir>
"""
import argparse
import os
from PIL import Image, ImageDraw, ImageFont
from tqdm import tqdm
import csv

def create_visual_equation(equation, font):
    """Render a whitespace-tokenised equation as one horizontal strip image.

    Every token of ``equation`` is drawn centred in its own transparent cell
    and the cells are pasted left to right onto a white RGB canvas.

    Args:
        equation: Equation text; tokens are separated by whitespace.
        font: Pillow font object used to measure and draw each token.

    Returns:
        A ``(combined_image, metadata)`` tuple where ``combined_image`` is the
        RGB strip and ``metadata`` maps each token to itself.

    Raises:
        ValueError: If ``equation`` contains no tokens.
    """
    elements = equation.split()
    if not elements:
        # Fail with a clear message instead of the cryptic unpacking error the
        # empty ``zip`` below would otherwise produce.
        raise ValueError("equation contains no renderable tokens")

    min_cell = 200  # Smallest cell edge, in pixels
    padding = 10    # Margin kept around tokens that outgrow the minimum cell
    spacing = 5     # Horizontal gap between neighbouring cells

    # Measure every token first so that all cells can share one height.
    scratch = ImageDraw.Draw(Image.new('RGBA', (1, 1)))
    boxes = [scratch.textbbox((0, 0), element, font=font) for element in elements]
    tallest = max(box[3] - box[1] for box in boxes)
    cell_height = max(min_cell, int(tallest) + 2 * padding)

    images = []
    metadata = {}
    for element, (left, top, right, bottom) in zip(elements, boxes):
        w, h = right - left, bottom - top
        # Widen the cell when the token would not fit, so it is never clipped.
        cell_width = max(min_cell, int(w) + 2 * padding)
        element_image = Image.new('RGBA', (cell_width, cell_height), (255, 255, 255, 0))
        draw = ImageDraw.Draw(element_image)
        # The bounding box does not start at the drawing origin; subtract its
        # left/top offset so the visible glyphs are what ends up centred.
        position = ((cell_width - w) / 2 - left, (cell_height - h) / 2 - top)
        draw.text(position, element, font=font, fill=(0, 0, 0))
        images.append(element_image)
        metadata[element] = element

    # Combine the cells side by side on a white background.
    total_width = sum(img.width for img in images) + spacing * (len(images) - 1)
    combined_image = Image.new('RGB', (total_width, cell_height), color=(255, 255, 255))

    x_offset = 0
    for img in images:
        # Use the cell's own alpha channel as the mask so only the glyph
        # pixels are painted onto the white canvas.
        combined_image.paste(img, (x_offset, 0), img)
        x_offset += img.width + spacing

    return combined_image, metadata

def main(equations_file, output_folder, metadata_file):
    """Render each equation set in a text file to a PNG and write a metadata CSV.

    Every input line is expected to look like
    ``<equations><sep><assignments>``: the part before the literal ``<sep>``
    marker is a comma-separated list of equations, the part after it is a
    comma-separated list of ``name=value`` pairs. Lines without ``<sep>`` or
    without any non-empty equation are reported and skipped. The equations of
    one line are stacked vertically into ``equation_set_<line number>.png``.

    Args:
        equations_file: Path of the text file to read.
        output_folder: Directory receiving the PNG files; created if missing.
        metadata_file: Path of the CSV file to write. It has a ``filename``
            column followed by one column per variable name seen in the input.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Load font with a fallback mechanism
    try:
        font = ImageFont.truetype("Arial.ttf", 120)
    except IOError:
        try:
            font = ImageFont.truetype("DejaVuSans.ttf", 120)
        except IOError:
            print("Warning: Using default font, might not render properly!")
            font = ImageFont.load_default()

    with open(equations_file, 'r') as file:
        lines = file.readlines()

    metadata_list = []
    # Column order of the CSV: 'filename' first, then every variable name in
    # the order it is first seen, so rows with differing variables all fit.
    fieldnames = ['filename']
    vertical_gap = 10

    # Wrapping the list (not the enumerate object) lets tqdm show the total.
    for i, line in enumerate(tqdm(lines)):
        # Split the line to get equations and variable values
        parts = line.strip().split('<sep>')
        if len(parts) < 2:
            print(f"Skipping malformed line: {line}")
            continue

        # Dynamically extract variable names and values
        variables = {}
        for assignment in parts[1].strip().split(','):
            # partition splits on the first '=' only, so values that contain
            # '=' themselves no longer break the unpacking.
            key, separator, value = assignment.partition('=')
            if separator:
                variables[key.strip()] = value.strip()

        # Drop empty segments (e.g. from a trailing comma); they have nothing
        # to render.
        equations = [eq.strip() for eq in parts[0].split(',') if eq.strip()]
        if not equations:
            print(f"Skipping line with no equations: {line}")
            continue

        equation_images = [create_visual_equation(eq, font)[0] for eq in equations]

        # Combine images vertically
        max_width = max(img.width for img in equation_images)
        total_height = (
            sum(img.height for img in equation_images)
            + vertical_gap * (len(equation_images) - 1)
        )
        combined_image = Image.new('RGB', (max_width, total_height), color=(255, 255, 255))

        y_offset = 0
        for img in equation_images:
            combined_image.paste(img, (0, y_offset))
            y_offset += img.height + vertical_gap

        # The file number follows the input line number, so skipped lines
        # leave gaps in the numbering.
        output_path = os.path.join(output_folder, f"equation_set_{i + 1}.png")
        combined_image.save(output_path, dpi=(300, 300))

        # Store filename and all variables in metadata
        metadata_entry = {'filename': os.path.basename(output_path)}
        metadata_entry.update(variables)
        metadata_list.append(metadata_entry)

        for key in variables:
            if key not in fieldnames:
                fieldnames.append(key)

    # Write metadata to CSV file; variables missing from a row are left empty
    # (DictWriter's default restval).
    with open(metadata_file, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(metadata_list)


def cli_main():
    """Parse the command-line flags and hand them to ``main``.

    Both ``--equations_file`` and ``--output_dir`` are required; the metadata
    CSV is written as ``metadata.csv`` inside the output directory.
    """
    parser = argparse.ArgumentParser(
        description="Generate OCR custom questions for math-eval dataset."
    )
    required_flags = (
        ('--equations_file', 'Path to the equations file'),
        ('--output_dir', 'Output directory for generated images and metadata'),
    )
    for flag, help_text in required_flags:
        parser.add_argument(flag, type=str, required=True, help=help_text)
    options = parser.parse_args()

    main(
        options.equations_file,
        options.output_dir,
        os.path.join(options.output_dir, "metadata.csv"),
    )

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    cli_main()