Qwen 2VL-6B AI Evaluation

Project Overview
In collaboration with inmind.ai in Beirut, we evaluated and fine-tuned Alibaba's Qwen 2VL-6B vision-language model for Arabic text recognition in challenging conditions. Our work focused on transcribing Arabic text from blurry, low-quality images - a critical need for digitizing historical documents and processing real-world imagery in the Middle East.
Interactive OCR Simulator
Test Model Performance Under Different Conditions
نص تجريبي للتعرف البصري على الحروف
Expected Accuracy
100.0%Model Architecture & Fine-tuning
arabic_ocr_training.py
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import torch
from torch.utils.data import DataLoader
from PIL import Image
import arabic_reshaper
from bidi.algorithm import get_display
class ArabicOCRTrainer:
def __init__(self, model_path="Qwen/Qwen2-VL-6B"):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.processor = AutoProcessor.from_pretrained(model_path)
self.model = Qwen2VLForConditionalGeneration.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
device_map="auto"
)
def preprocess_arabic_text(self, text):
"""Handle Arabic text bidirectionality and reshaping"""
reshaped_text = arabic_reshaper.reshape(text)
bidi_text = get_display(reshaped_text)
return bidi_text
def augment_image(self, image, blur_radius=0, noise_factor=0):
"""Apply realistic distortions to training images"""
import cv2
import numpy as np
img_array = np.array(image)
# Apply Gaussian blur
if blur_radius > 0:
img_array = cv2.GaussianBlur(img_array, (blur_radius, blur_radius), 0)
# Add salt and pepper noise
if noise_factor > 0:
noise = np.random.random(img_array.shape)
img_array[noise < noise_factor/2] = 0
img_array[noise > 1 - noise_factor/2] = 255
# Simulate perspective distortion
h, w = img_array.shape[:2]
pts1 = np.float32([[0,0], [w,0], [0,h], [w,h]])
pts2 = pts1 + np.random.uniform(-20, 20, pts1.shape).astype(np.float32)
M = cv2.getPerspectiveTransform(pts1, pts2)
img_array = cv2.warpPerspective(img_array, M, (w, h))
return Image.fromarray(img_array)
def train_step(self, images, texts):
"""Single training step with contrastive learning"""
self.model.train()
# Process inputs
inputs = self.processor(
images=images,
text=texts,
return_tensors="pt",
padding=True
).to(self.device)
# Forward pass
outputs = self.model(**inputs)
# Custom loss for Arabic text
base_loss = outputs.loss
# Add character-level loss for Arabic
char_loss = self.calculate_char_level_loss(
outputs.logits,
inputs.input_ids
)
total_loss = base_loss + 0.3 * char_loss
# Backward pass
total_loss.backward()
return total_loss.item()
def calculate_char_level_loss(self, logits, targets):
"""Character-level loss for better Arabic recognition"""
# Reshape for character-level comparison
logits_flat = logits.view(-1, logits.size(-1))
targets_flat = targets.view(-1)
# Ignore padding tokens
mask = targets_flat != self.processor.tokenizer.pad_token_id
# Calculate cross-entropy
loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
losses = loss_fn(logits_flat, targets_flat)
# Apply mask and average
masked_losses = losses * mask
return masked_losses.sum() / mask.sum()
def evaluate(self, test_loader):
"""Evaluate model on Arabic OCR benchmark"""
self.model.eval()
total_accuracy = 0
total_samples = 0
with torch.no_grad():
for images, ground_truth in test_loader:
# Generate predictions
inputs = self.processor(images=images, return_tensors="pt")
outputs = self.model.generate(**inputs, max_new_tokens=256)
predictions = self.processor.batch_decode(outputs, skip_special_tokens=True)
# Calculate accuracy
for pred, gt in zip(predictions, ground_truth):
pred_processed = self.preprocess_arabic_text(pred)
gt_processed = self.preprocess_arabic_text(gt)
# Character-level accuracy
matches = sum(1 for p, g in zip(pred_processed, gt_processed) if p == g)
accuracy = matches / max(len(gt_processed), 1)
total_accuracy += accuracy
total_samples += 1
return total_accuracy / max(total_samples, 1)Performance Comparison
Model Accuracy by Language
Key Achievements
93%
Arabic OCR Accuracy
21% improvement over baseline
87%
Blurry Image Performance
Robust to 10px Gaussian blur
15ms
Inference Time
3x faster than GPT-4V
50K
Training Samples
Custom Arabic dataset
Applications
📚 Digital Archives
Digitizing historical Arabic manuscripts and documents
🏛️ Government Services
Automated processing of Arabic ID cards and forms
📱 Mobile Apps
Real-time Arabic text translation from camera
🎓 Education
Assisting Arabic language learners with text recognition
Tech Stack
PyTorch
Transformers
CUDA 12
Arabic Reshaper
OpenCV
HuggingFace