Skip to content

1. Region-Based CNN

1. RCNN Implementation

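Below is an example of an R-CNN-style (Region-based Convolutional Neural Network) detector in PyTorch. torchvision does not ship the original R-CNN as a standalone model, because its functionality is embedded in more advanced versions such as Faster R-CNN. This example therefore assembles a detector from torchvision's detection components around a pre-trained ResNet-50 backbone. A ready-made, fully pre-trained detector is available as torchvision.models.detection.fasterrcnn_resnet50_fpn and is demonstrated in Section 2.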

Sample Code for R-CNN using PyTorch

import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision
import torchvision.transforms as T
from PIL import Image
# NOTE: torchvision.models.detection has no `rcnn` submodule; FasterRCNN is
# the public detector class assembled from a backbone, RPN and RoI heads.
from torchvision.models.detection import FasterRCNN, fasterrcnn_resnet50_fpn
from torchvision.models.detection.roi_heads import RoIHeads
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.ops import MultiScaleRoIAlign
# Define a basic R-CNN style detector
def create_rcnn_model(num_classes):
    """Build a Faster R-CNN detector around a pre-trained ResNet-50 trunk.

    Only the backbone carries pre-trained weights. The region proposal
    network and the box heads are freshly initialised, so the model has to be
    fine-tuned before its detections are meaningful.

    Args:
        num_classes: Number of output classes, including the background class.

    Returns:
        A ``torchvision.models.detection.FasterRCNN`` instance.
    """
    # Keep only the convolutional trunk: drop the average-pool and FC layers.
    resnet = torchvision.models.resnet50(weights="DEFAULT")
    backbone = torch.nn.Sequential(*list(resnet.children())[:-2])
    # FasterRCNN reads this attribute to size the RPN and the box head;
    # the last ResNet-50 stage emits 2048 channels.
    backbone.out_channels = 2048

    # The trunk yields a single feature map, so exactly one tuple of sizes and
    # one tuple of aspect ratios is needed (5 x 3 anchors per location).
    anchor_generator = AnchorGenerator(
        sizes=((32, 64, 128, 256, 512),),
        aspect_ratios=((0.5, 1.0, 2.0),),
    )

    # A backbone that returns a bare tensor is exposed by torchvision under
    # the feature-map name "0".
    roi_pool = MultiScaleRoIAlign(
        featmap_names=["0"],
        output_size=7,
        sampling_ratio=2,
    )

    # FasterRCNN wires the backbone, RPN and RoI heads together.
    return FasterRCNN(
        backbone,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=roi_pool,
    )
# Define the transformations
def transform_image(image_path):
    """Load an image file and convert it to a tensor.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The RGB image converted by ``torchvision.transforms.ToTensor``.
    """
    # The context manager releases the file handle; convert() returns an
    # independent, fully loaded copy that outlives the ``with`` block.
    with Image.open(image_path) as source:
        rgb_image = source.convert("RGB")
    return T.ToTensor()(rgb_image)  # Convert image to tensor
# Perform object detection
def detect_objects(image_path):
    """Run the detector from ``create_rcnn_model`` on a single image.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        The first entry of the model output for the one-image batch.
    """
    detector = create_rcnn_model(num_classes=91)  # 91 is for COCO dataset
    detector.eval()  # Inference mode
    batch = [transform_image(image_path)]
    # Gradients are not needed for inference.
    with torch.no_grad():
        outputs = detector(batch)
    return outputs[0]
# Draw bounding boxes on the image
def draw_boxes(image_path, predictions):
    """Show the image with every confident detection drawn on top.

    Args:
        image_path: Path of the image the predictions were computed for.
        predictions: Mapping with 'boxes', 'labels' and 'scores' tensors.
    """
    # Filter out detections with scores below a threshold (e.g., 0.5).
    threshold = 0.5
    keep = predictions['scores'] >= threshold
    # Apply the same mask to all three tensors so every box stays paired with
    # its own label and score.
    boxes = predictions['boxes'][keep].cpu().numpy()
    labels = predictions['labels'][keep].cpu().numpy()
    scores = predictions['scores'][keep].cpu().numpy()

    # Convert image tensor to a height x width x channel uint8 array.
    image_tensor = transform_image(image_path)
    image_np = image_tensor.permute(1, 2, 0).numpy()
    image_np = (image_np * 255).astype(np.uint8)

    # Draw bounding boxes
    _, ax = plt.subplots(1, figsize=(12, 9))
    ax.imshow(image_np)
    # Each box holds two corners (x1, y1, x2, y2), not a width and a height.
    for (x1, y1, x2, y2), label, score in zip(boxes, labels, scores):
        rect = plt.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2, edgecolor='red', facecolor='none')
        ax.add_patch(rect)
        ax.text(x1, y1, f'{int(label)}: {score:.2f}', bbox=dict(facecolor='yellow', alpha=0.5), fontsize=12, color='black')
    plt.axis('off')
    plt.show()
# Example usage
# Guarded so that importing this file does not trigger model construction
# and plotting on a placeholder path.
if __name__ == "__main__":
    image_path = 'path_to_your_image.jpg'
    predictions = detect_objects(image_path)
    draw_boxes(image_path, predictions)

Explanation:

  1. Define the R-CNN Model:

    def create_rcnn_model(num_classes):
        """Build a Faster R-CNN detector around a pre-trained ResNet-50 trunk.

        Only the backbone is pre-trained; the RPN and box heads start from
        random weights and need fine-tuning.
        """
        resnet = torchvision.models.resnet50(weights="DEFAULT")
        # Drop the average-pool and FC layers, keeping the conv trunk.
        backbone = torch.nn.Sequential(*list(resnet.children())[:-2])
        # FasterRCNN reads this attribute to size the RPN and the box head.
        backbone.out_channels = 2048
        # One feature map -> one tuple of sizes and one of aspect ratios.
        anchor_generator = AnchorGenerator(
            sizes=((32, 64, 128, 256, 512),),
            aspect_ratios=((0.5, 1.0, 2.0),),
        )
        # A bare-tensor backbone output is exposed under the name "0".
        roi_pool = MultiScaleRoIAlign(
            featmap_names=["0"],
            output_size=7,
            sampling_ratio=2,
        )
        return FasterRCNN(
            backbone,
            num_classes=num_classes,
            rpn_anchor_generator=anchor_generator,
            box_roi_pool=roi_pool,
        )
    • Uses a ResNet-50 backbone for feature extraction.
    • Configures Region Proposal Network (RPN) and RoI heads for object detection.
  2. Image Transformation:

    def transform_image(image_path):
        """Load an image file and convert it to a tensor."""
        # Close the file handle once the RGB copy has been made.
        with Image.open(image_path) as source:
            rgb_image = source.convert("RGB")
        return T.ToTensor()(rgb_image)
  3. Object Detection:

    def detect_objects(image_path):
        """Run the detector from ``create_rcnn_model`` on a single image."""
        detector = create_rcnn_model(num_classes=91)
        detector.eval()
        batch = [transform_image(image_path)]
        # Gradients are not needed for inference.
        with torch.no_grad():
            outputs = detector(batch)
        return outputs[0]
  4. Drawing Bounding Boxes:

    def draw_boxes(image_path, predictions):
        """Show the image with every confident detection drawn on top."""
        threshold = 0.5
        keep = predictions['scores'] >= threshold
        # Mask all three tensors alike so boxes, labels and scores stay aligned.
        boxes = predictions['boxes'][keep].cpu().numpy()
        labels = predictions['labels'][keep].cpu().numpy()
        scores = predictions['scores'][keep].cpu().numpy()
        image_tensor = transform_image(image_path)
        image_np = image_tensor.permute(1, 2, 0).numpy()
        image_np = (image_np * 255).astype(np.uint8)
        _, ax = plt.subplots(1, figsize=(12, 9))
        ax.imshow(image_np)
        # Each box holds two corners (x1, y1, x2, y2), not a width and a height.
        for (x1, y1, x2, y2), label, score in zip(boxes, labels, scores):
            rect = plt.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2, edgecolor='red', facecolor='none')
            ax.add_patch(rect)
            ax.text(x1, y1, f'{int(label)}: {score:.2f}', bbox=dict(facecolor='yellow', alpha=0.5), fontsize=12, color='black')
        plt.axis('off')
        plt.show()

2. Faster R-CNN Implementation

This example demonstrates how to use a pre-trained Faster R-CNN model for inference on an image. It includes loading the model, processing the image, and displaying the results with bounding boxes.

2.1 Sample Code for Faster R-CNN using PyTorch

import torch
import torchvision.transforms as T
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
# Load a pre-trained Faster R-CNN model
# `weights="DEFAULT"` selects the default pre-trained checkpoint and replaces
# the deprecated `pretrained=True` flag.
model = fasterrcnn_resnet50_fpn(weights="DEFAULT")
model.eval()  # Set the model to evaluation mode
# Define a function to transform the input image
def transform_image(image_path):
    """Load an image file and convert it to a tensor.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The RGB image converted by ``torchvision.transforms.ToTensor``.
    """
    # The context manager releases the file handle; convert() returns an
    # independent, fully loaded copy that outlives the ``with`` block.
    with Image.open(image_path) as source:
        rgb_image = source.convert("RGB")
    return T.ToTensor()(rgb_image)  # Convert image to tensor
# Perform object detection
def detect_objects(image_path):
    """Run the module-level ``model`` on a single image.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        The first entry of the model output for the one-image batch.
    """
    batch = [transform_image(image_path)]
    # Gradients are not needed for inference.
    with torch.no_grad():
        outputs = model(batch)
    return outputs[0]
# Draw bounding boxes on the image
def draw_boxes(image_path, predictions):
    """Show the image with every confident detection drawn on top.

    Args:
        image_path: Path of the image the predictions were computed for.
        predictions: Mapping with 'boxes', 'labels' and 'scores' tensors.
    """
    # Filter out detections with scores below a threshold (e.g., 0.5).
    threshold = 0.5
    keep = predictions['scores'] >= threshold
    # Apply the same mask to all three tensors so every box stays paired with
    # its own label and score.
    boxes = predictions['boxes'][keep].cpu().numpy()
    labels = predictions['labels'][keep].cpu().numpy()
    scores = predictions['scores'][keep].cpu().numpy()

    # Convert image tensor to a height x width x channel uint8 array.
    image_tensor = transform_image(image_path)
    image_np = image_tensor.permute(1, 2, 0).numpy()
    image_np = (image_np * 255).astype(np.uint8)

    # Draw bounding boxes
    _, ax = plt.subplots(1, figsize=(12, 9))
    ax.imshow(image_np)
    # Each box holds two corners (x1, y1, x2, y2), not a width and a height.
    for (x1, y1, x2, y2), label, score in zip(boxes, labels, scores):
        rect = plt.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2, edgecolor='red', facecolor='none')
        ax.add_patch(rect)
        ax.text(x1, y1, f'{int(label)}: {score:.2f}', bbox=dict(facecolor='yellow', alpha=0.5), fontsize=12, color='black')
    plt.axis('off')
    plt.show()
# Example usage
# Guarded so that importing this file does not trigger inference and
# plotting on a placeholder path.
if __name__ == "__main__":
    image_path = 'path_to_your_image.jpg'
    predictions = detect_objects(image_path)
    draw_boxes(image_path, predictions)

2.2 Explanation:

  1. Loading the Model:

    # `weights="DEFAULT"` replaces the deprecated `pretrained=True` flag.
    model = fasterrcnn_resnet50_fpn(weights="DEFAULT")
    model.eval()  # Set the model to evaluation mode
    • Loads a pre-trained Faster R-CNN model with a ResNet-50 backbone and Feature Pyramid Network (FPN).
  2. Image Transformation:

    def transform_image(image_path):
        """Load an image file and convert it to a tensor."""
        # Close the file handle once the RGB copy has been made.
        with Image.open(image_path) as source:
            rgb_image = source.convert("RGB")
        return T.ToTensor()(rgb_image)
    • Converts the image to a tensor suitable for model input.
  3. Object Detection:

    def detect_objects(image_path):
        """Run the module-level ``model`` on a single image."""
        batch = [transform_image(image_path)]
        # Gradients are not needed for inference.
        with torch.no_grad():
            outputs = model(batch)
        return outputs[0]
    • Passes the image through the model to get predictions (bounding boxes, labels, scores).
  4. Drawing Bounding Boxes:

    def draw_boxes(image_path, predictions):
        """Show the image with every confident detection drawn on top."""
        # Extract and filter predictions; the same mask is applied to all
        # three tensors so boxes, labels and scores stay aligned.
        threshold = 0.5
        keep = predictions['scores'] >= threshold
        boxes = predictions['boxes'][keep].cpu().numpy()
        labels = predictions['labels'][keep].cpu().numpy()
        scores = predictions['scores'][keep].cpu().numpy()
        # Convert image tensor to numpy array
        image_tensor = transform_image(image_path)
        image_np = image_tensor.permute(1, 2, 0).numpy()
        image_np = (image_np * 255).astype(np.uint8)
        # Plot results
        _, ax = plt.subplots(1, figsize=(12, 9))
        ax.imshow(image_np)
        # Each box holds two corners (x1, y1, x2, y2), not a width and a height.
        for (x1, y1, x2, y2), label, score in zip(boxes, labels, scores):
            rect = plt.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2, edgecolor='red', facecolor='none')
            ax.add_patch(rect)
            ax.text(x1, y1, f'{int(label)}: {score:.2f}', bbox=dict(facecolor='yellow', alpha=0.5), fontsize=12, color='black')
        plt.axis('off')
        plt.show()
    • Draws bounding boxes on the image and displays the results with matplotlib.

2.3 Notes:

  • Threshold: You can adjust the threshold for filtering low-confidence detections.
  • Labels: The labels are indices of the classes. To map them to actual class names, you need the COCO class names or any other dataset-specific labels.

This code demonstrates a basic implementation of Faster R-CNN for inference and visualization. For training and more complex tasks, you would need to handle dataset preparation, custom training loops, and evaluation metrics.