DiffThinker/
├── DiffSynth-Studio/
│ ├── add/
│ │ ├── cmd/
│ │ │ ├── 2509.sh # train Qwen-Image-Edit-2509
│ │ │ └── 2511.sh # train Qwen-Image-Edit-2511
│ │ ├── infer/
│ │ │ ├── infer_with_middle.py
│ │ │ └── infer.py
│ │ └── merge_ckpt.py
│ ├── diffsynth/
│ └── ...
├── FrozenLake/
├── Maze/ # example
│ ├── 8_test/ # test
│ │ ├── 8_1_001.png
│ │ ├── 8_1_001_solution.png
│ │ ├── 8_1_001.txt # metadata
│ │ ├── ...
│ │ └── path.json # ground-truth
│ ├── 16_test/ # test
│ ├── 32_test/ # test
│ ├── eval/
│ │ ├── diffthinker.py # infer
│ │ ├── parse_image.py # parse
│ │ ├── eval_path.py # compare with ground-truth
│ │ ├── gen_and_parse.sh
│ │ └── eval_path.sh
│ ├── gen_image.py # generate dataset
│ ├── gen.txt # examples for gen_image.py
│ ├── gen_csv.py # generate metadata for training
│ └── ...
├── TSP/
├── Sudoku/
└── Jigsaw/
git clone https://github.com/lcqysl/DiffThinker.git
cd DiffThinker/DiffSynth-Studio
pip install -e .
pip install gymnasium
# (Optional) Install vLLM for OCR tasks
# We recommend installing it in a SEPARATE environment to avoid conflicts.
# pip install vllm
We use Maze as an example to demonstrate the full pipeline: Data Preparation -> Training -> Inference -> Parsing -> Evaluation. First, download the base models.
cd Maze
# 1. Data Preparation
python gen_image.py --size 8 --num 2000 --min_len 1 --out ./8_train
python gen_csv.py --dir ./8_train
# Note: We recommend following the configurations in Maze/gen.txt
# to reproduce the difficulty levels used in our paper.
# 2. Training
cd ../DiffSynth-Studio
bash add/cmd/2509.sh
The test datasets used in our experiments are provided within each task's directory. We recommend using the same data to ensure the reproducibility of our results and to facilitate comparison with other models. If you wish to generate your own test data, please refer to the gen.txt file in each task directory.
cd Maze
# 1. Inference and Parsing
bash eval/gen_and_parse.sh
# 2. Evaluation
bash eval/eval_path.sh
# 3. Individual Inference
python ../DiffSynth-Studio/add/infer/infer.py
python ../DiffSynth-Studio/add/infer/infer_with_middle.py
If you find our work useful, please consider citing:
@article{he2025diffthinker,
title={DiffThinker: Towards Generative Multimodal Reasoning with Diffusion Models},
author={He, Zefeng and Qu, Xiaoye and Li, Yafu and Zhu, Tong and Huang, Siyuan and Cheng, Yu},
journal={arXiv preprint arXiv:2512.24165},
year={2025}
}
