Source code for dlordinal.datasets.fgnet

import re
import shutil
from pathlib import Path
from typing import Any, Callable, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from PIL import Image
from skimage.io import imread, imsave
from skimage.transform import resize
from skimage.util import img_as_ubyte
from sklearn.model_selection import train_test_split
from torchvision.datasets.utils import download_and_extract_archive
from torchvision.datasets.vision import VisionDataset
from tqdm import tqdm


class FGNet(VisionDataset):
    """
    Base class for FGNet dataset.

    On construction the dataset is (optionally) downloaded, every image is
    resized to ``target_size`` and written to ``<root>/FGNET/data_processed``,
    and a stratified train/test split is stored both as CSV files and as
    per-class image folders.

    Attributes
    ----------
    root : Path
        Root directory of the dataset.
    target_size : tuple
        Size of the images after resizing.
    categories : list
        Age thresholds used to bin the real age into ordinal categories
        (see :meth:`find_category`).
    test_size : float
        Size of the test set.
    validation_size : float
        Size of the validation set.
    transform : callable, optional
        A function/transform that takes in a PIL image and returns a
        transformed version.
    target_transform : callable, optional
        A function/transform that takes in the target and transforms it.
    data : pd.DataFrame
        Dataframe containing the dataset (columns ``path`` and ``category``).

    Parameters
    ----------
    root : str or Path
        Root directory of the dataset.
    download : bool, optional, default = True
        If True, downloads the dataset from the internet and puts it in the
        root directory. If the dataset is already downloaded, it is not
        downloaded again.
    target_size : tuple, optional
        Size of the images after resizing. Default is (128, 128).
    categories : list, optional
        List of categories to be used. Default is [3, 11, 16, 24, 40].
    test_size : float, optional
        Size of the test set. Default is 0.2.
    validation_size : float, optional
        Size of the validation set. Default is 0.15.
    train : bool, optional
        If True, returns the training dataset, otherwise returns the test
        dataset. Default is True.
    transform : callable, optional
        A function/transform that takes in a PIL image and returns a
        transformed version.
    target_transform : callable, optional
        A function/transform that takes in the target and transforms it.

    Raises
    ------
    RuntimeError
        If the ``FGNET`` directory is not present under ``root`` after the
        optional download step.
    """

    # Attributes
    root: Path
    target_size: tuple
    categories: list
    test_size: float
    validation_size: float
    transform: Optional[Callable]
    target_transform: Optional[Callable]
    data: pd.DataFrame

    def __init__(
        self,
        root: Union[str, Path],
        download: bool = True,
        target_size: tuple = (128, 128),
        categories: list = [3, 11, 16, 24, 40],
        test_size: float = 0.2,
        validation_size: float = 0.15,
        train: bool = True,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
    ) -> None:
        super().__init__(
            str(root), transform=transform, target_transform=target_transform
        )

        self.root = Path(root)
        self.root.parent.mkdir(parents=True, exist_ok=True)

        self.target_size = target_size
        # Store a copy: the default argument is a single list object shared by
        # every call, so keeping a reference would let one instance's
        # modifications leak into all later instances.
        self.categories = list(categories)
        self.test_size = test_size
        self.validation_size = validation_size
        self.transform = transform
        self.target_transform = target_transform

        original_path = self.root / "FGNET/images"
        processed_path = self.root / "FGNET/data_processed"
        original_csv_path = self.root / "FGNET/data_processed/fgnet.csv"
        train_csv_path = self.root / "FGNET/data_processed/train.csv"
        test_csv_path = self.root / "FGNET/data_processed/test.csv"
        # The split copies from the already resized images, not the raw ones.
        original_images_path = self.root / "FGNET/data_processed"
        train_images_path = self.root / "FGNET/train"
        test_images_path = self.root / "FGNET/test"

        if download:
            self.download()

        if not self._check_integrity_download():
            raise RuntimeError(
                "Dataset not found or corrupted. You can use download=True to"
                " download it"
            )

        self.process(original_path, processed_path)

        self.split(
            original_csv_path,
            train_csv_path,
            test_csv_path,
            original_images_path,
            train_images_path,
            test_images_path,
        )

        # Load train and test dataframes
        if train:
            self.data = pd.read_csv(train_csv_path)
        else:
            self.data = pd.read_csv(test_csv_path)

    def __str__(self) -> str:
        return "FGNet"

    def __len__(self) -> int:
        """
        Obtain the number of samples in the dataset.

        Returns
        -------
        int
            Number of samples in the dataset.
        """
        return len(self.data)

    def __getitem__(self, index: int) -> Tuple[Any, Any]:
        """
        Get a sample from the dataset.

        Parameters
        ----------
        index : int
            Index of the sample to get.

        Returns
        -------
        tuple
            (image, target) where target is the class index of the target
            class.
        """
        row = self.data.iloc[index]
        img_path = self.root / "FGNET" / "data_processed" / row["path"]

        # Load the image as an RGB PIL image. ``convert`` returns a new
        # in-memory image, so the underlying file can be closed right away
        # instead of staying open until garbage collection.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")

        # Apply the image transform if one is defined
        if self.transform is not None:
            image = self.transform(image)

        target = int(row["category"])
        if self.target_transform is not None:
            target = self.target_transform(target)

        return image, target

    @property
    def targets(self) -> List[int]:
        """
        Return the targets of the dataset.

        When ``target_transform`` is set it is applied to every target
        individually, exactly as ``__getitem__`` does.

        Returns
        -------
        list
            List of targets.
        """
        labels = self.data["category"].tolist()
        if self.target_transform is not None:
            return [self.target_transform(label) for label in labels]
        return labels

    @property
    def classes(self) -> List[int]:
        """
        Return the unique classes in the dataset.

        Returns
        -------
        list
            List of unique classes.
        """
        return np.unique(self.data["category"]).tolist()

    def download(self) -> None:
        """
        Download the FGNet dataset and extract it.

        Nothing is downloaded when the ``FGNET`` directory already exists
        under ``root``.
        """
        if self._check_integrity_download():
            print("Files already downloaded and verified")
            return

        download_and_extract_archive(
            "http://yanweifu.github.io/FG_NET_data/FGNET.zip",
            str(self.root),
            filename="fgnet.zip",
            md5="1206978cac3626321b84c22b24cc8d19",
        )

    def process(self, original_path, processed_path) -> Optional[pd.DataFrame]:
        """
        Process the FGNet dataset and save it in the processed_path.

        Parameters
        ----------
        original_path : Path
            Path to the original dataset.
        processed_path : Path
            Path to save the processed dataset.

        Returns
        -------
        pd.DataFrame or None
            Dataframe with columns ``path`` and ``category``, or ``None`` when
            the dataset had already been processed.
        """
        processed_csv_path = processed_path / "fgnet.csv"

        # The directory alone is not proof of a finished run: it is created
        # before any image is written. The CSV is written last (see below), so
        # its presence marks a completed processing step.
        if self._check_integrity_process() and processed_csv_path.exists():
            print("Files already processed and verified")
            return

        data = self.load_data(original_path)
        df = pd.DataFrame(data, columns=["path", "category"])

        processed_path.mkdir(parents=True, exist_ok=True)
        self.process_images_from_df(df, original_path, processed_path)

        # Written after the images so an interrupted run is redone next time.
        df.to_csv(processed_csv_path, index=False)

        return df

    def split(
        self,
        original_csv_path: Path,
        train_csv_path: Path,
        test_csv_path: Path,
        original_images_path: Path,
        train_images_path: Path,
        test_images_path: Path,
    ) -> None:
        """
        Split the FGNet dataset into train and test sets.

        Parameters
        ----------
        original_csv_path : Path
            Path to the original csv file.
        train_csv_path : Path
            Path to save the train csv file.
        test_csv_path : Path
            Path to save the test csv file.
        original_images_path : Path
            Path to the original images.
        train_images_path : Path
            Path to save the train images.
        test_images_path : Path
            Path to save the test images.
        """
        # The image folders are created before the CSV files are written, so
        # the CSV files must exist too for the split to count as complete.
        if (
            self._check_integrity_split()
            and train_csv_path.exists()
            and test_csv_path.exists()
        ):
            print("Files already split and verified")
            return

        train, test = self.split_dataframe(
            original_csv_path, train_images_path, original_images_path, test_images_path
        )

        test.to_csv(test_csv_path, index=False)
        train.to_csv(train_csv_path, index=False)

    def _check_integrity_download(self) -> bool:
        """
        Check if the FGNet dataset is downloaded and extracted.

        Returns
        -------
        bool
            True if the ``FGNET`` directory exists under ``root``.
        """
        return (self.root / "FGNET").exists()

    def _check_integrity_process_split(self) -> bool:
        """
        Check if the FGNet dataset is processed and split.

        Returns
        -------
        bool
            True if the processed, train and test directories all exist.
        """
        # Delegates to the individual checks so the directory names cannot
        # drift apart from the ones the split actually creates.
        return self._check_integrity_process() and self._check_integrity_split()

    def _check_integrity_process(self) -> bool:
        """
        Check if the FGNet dataset is processed.

        Returns
        -------
        bool
            True if the ``FGNET/data_processed`` directory exists.
        """
        return (self.root / "FGNET/data_processed").exists()

    def _check_integrity_split(self) -> bool:
        """
        Check if the FGNet dataset is split.

        Returns
        -------
        bool
            True if both ``FGNET/train`` and ``FGNET/test`` exist.
        """
        train_dir = self.root / "FGNET/train"
        test_dir = self.root / "FGNET/test"
        return train_dir.exists() and test_dir.exists()

    def get_age_from_filename(self, filename) -> Optional[int]:
        """
        Get the age from the filename.

        The age is the run of digits that follows the ``A`` separator, which
        itself follows a leading run of digits.

        Parameters
        ----------
        filename : str
            Filename of the image.

        Returns
        -------
        int or None
            Parsed age, or ``None`` if the filename does not match the
            expected pattern.
        """
        m = re.match("[0-9]+A([0-9]+).*", filename)
        if m:
            return int(m.group(1))
        return None

    def find_category(self, real_age) -> int:
        """
        Find the category of the real age.

        Parameters
        ----------
        real_age : int
            Real age of the image.

        Returns
        -------
        int
            Index of the first entry of ``categories`` that is strictly
            greater than ``real_age``, or ``len(categories)`` if there is
            none.
        """
        for i, age in enumerate(self.categories):
            if real_age < age:
                return i
        return len(self.categories)

    def load_data(self, original_path: Path) -> List[list]:
        """
        Load the data from the original_path.

        Parameters
        ----------
        original_path : Path
            Path to the original dataset.

        Returns
        -------
        list
            One ``[filename, category]`` pair per entry whose name contains a
            parsable age.
        """
        data = []
        # Directory listing order depends on the filesystem; sorting keeps the
        # seeded train/test split identical across machines.
        for img in sorted(original_path.iterdir()):
            age = self.get_age_from_filename(img.name)
            if age is None:
                # Not an FGNet image name; comparing None with the thresholds
                # would raise a TypeError.
                continue
            data.append([img.name, self.find_category(age)])
        return data

    def process_images_from_df(
        self, df: pd.DataFrame, original_path: Path, processed_path: Path
    ) -> None:
        """
        Process the images from the dataframe.

        Every image listed in the ``path`` column is read from
        ``original_path``, resized to ``target_size`` and saved under the same
        relative name in ``processed_path``.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe with the images.
        original_path : Path
            Path to the original dataset.
        processed_path : Path
            Path to save the processed dataset.
        """
        for _, row in tqdm(
            df.iterrows(), total=df.shape[0], desc="Processing images", unit="image"
        ):
            path = original_path / Path(row["path"])
            processed_path_images = processed_path / Path(row["path"])

            img = imread(path)
            img = img_as_ubyte(resize(img, self.target_size, anti_aliasing=True))

            processed_path_images.parent.mkdir(parents=True, exist_ok=True)
            imsave(processed_path_images, img, check_contrast=False)

    def split_dataframe(
        self,
        csv_path: Path,
        train_images_path: Path,
        original_images_path: Path,
        test_images_path: Path,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Split the dataframe into train and test sets.

        The images of each partition are also copied into
        ``<partition path>/<category>/``.

        Parameters
        ----------
        csv_path : Path
            Path to the csv file.
        train_images_path : Path
            Path to save the train images.
        original_images_path : Path
            Path to the original images.
        test_images_path : Path
            Path to save the test images.

        Returns
        -------
        tuple of pd.DataFrame
            ``(train_df, test_df)``, each with columns ``path`` and
            ``category``.
        """
        df = pd.read_csv(csv_path)
        x = np.array(df["path"])
        y = np.array(df["category"])

        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            test_size=self.test_size,
            random_state=1,
            stratify=y,
        )

        for path, label in zip(x_train, y_train):
            train_path = train_images_path / str(label)
            train_path.mkdir(parents=True, exist_ok=True)
            shutil.copy(original_images_path / path, train_path / path)

        for path, label in zip(x_test, y_test):
            test_path = test_images_path / str(label)
            test_path.mkdir(parents=True, exist_ok=True)
            shutil.copy(original_images_path / path, test_path / path)

        # NOTE: the validation rows are stacked back under the training rows
        # below, so ``validation_size`` only affects the row order of the
        # returned training dataframe, not its content.
        x_train, x_val, y_train, y_val = train_test_split(
            x_train,
            y_train,
            test_size=self.validation_size,
            random_state=1,
            stratify=y_train,
        )

        train = np.hstack((x_train[:, np.newaxis], y_train[:, np.newaxis]))
        val = np.hstack((x_val[:, np.newaxis], y_val[:, np.newaxis]))
        test = np.hstack((x_test[:, np.newaxis], y_test[:, np.newaxis]))
        trainval = np.vstack((train, val))

        test_df = pd.DataFrame(data=test, columns=["path", "category"])
        train_df = pd.DataFrame(data=trainval, columns=["path", "category"])

        return train_df, test_df