vision_unlearning.datasets
==========================

.. py:module:: vision_unlearning.datasets


Submodules
----------

.. toctree::
   :maxdepth: 1

   /autoapi/vision_unlearning/datasets/base/index
   /autoapi/vision_unlearning/datasets/cifar/index
   /autoapi/vision_unlearning/datasets/coco/index
   /autoapi/vision_unlearning/datasets/imagenette/index
   /autoapi/vision_unlearning/datasets/local/index
   /autoapi/vision_unlearning/datasets/others/index
   /autoapi/vision_unlearning/datasets/testbed/index


Attributes
----------

.. autoapisummary::

   vision_unlearning.datasets.logger
   vision_unlearning.datasets._type_task
   vision_unlearning.datasets._type_method
   vision_unlearning.datasets.task_to_dataset_map


Classes
-------

.. autoapisummary::

   vision_unlearning.datasets.GeneratedDataset


Functions
---------

.. autoapisummary::

   vision_unlearning.datasets.get_logger
   vision_unlearning.datasets.get_target_preprocessed
   vision_unlearning.datasets.get_target_overwrite
   vision_unlearning.datasets.get_metadata_filtered_path
   vision_unlearning.datasets.get_metadata_filtered
   vision_unlearning.datasets.save_metadata_filtered
   vision_unlearning.datasets.exists_metadata_filtered
   vision_unlearning.datasets.get_attribute_for_entity
   vision_unlearning.datasets.get_unlearned_model_folder
   vision_unlearning.datasets.exists_unlearned_model
   vision_unlearning.datasets.get_generated_dataset_folder
   vision_unlearning.datasets.get_generated_dataset_file
   vision_unlearning.datasets.exists_unlearned_dataset
   vision_unlearning.datasets.get_shared_baseline_folder
   vision_unlearning.datasets.get_off_image_path
   vision_unlearning.datasets.get_similarity_clip_path
   vision_unlearning.datasets.get_similarity_clip_df
   vision_unlearning.datasets.calculate_similarity_clip
   vision_unlearning.datasets.plot_heatmap


Package Contents
----------------

.. py:function:: get_logger(name: str, level=logging.INFO) -> logging.Logger

.. py:data:: logger

.. py:data:: _type_task

.. py:data:: _type_method

.. py:function:: get_target_preprocessed(task: Literal['scenes', 'objects', 'breeds', 'people'], target: str) -> str

.. py:function:: get_target_overwrite(task: Literal['scenes', 'objects', 'breeds', 'people'], method: Literal['munba', 'uce', 'distil'], target: str) -> Tuple[str, str]

   :returns: The preprocessed target and the corresponding ``target_overwrite``.


.. py:function:: get_metadata_filtered_path(task: Literal['scenes', 'objects', 'breeds', 'people'], base_folder: str = 'assets') -> str

.. py:function:: get_metadata_filtered(task: Literal['scenes', 'objects', 'breeds', 'people'], base_folder: str = 'assets') -> List[Dict[str, Any]]

.. py:function:: save_metadata_filtered(task: Literal['scenes', 'objects', 'breeds', 'people'], metadata_filtered: List[Dict[str, Any]], base_folder: str = 'assets')

.. py:function:: exists_metadata_filtered(task: Literal['scenes', 'objects', 'breeds', 'people'], base_folder: str = 'assets') -> bool

.. py:function:: get_attribute_for_entity(metadata_filtered: List[Dict[str, Any]], entity_name: str, attribute: str) -> Any

.. py:data:: task_to_dataset_map
   :type:  Dict[Literal['scenes', 'objects', 'breeds', 'people'], str]

.. py:function:: get_unlearned_model_folder(task: Literal['scenes', 'objects', 'breeds', 'people'], method: Literal['munba', 'uce', 'distil'], num_train_epochs: int, target: str, base_folder: str = 'assets') -> str

.. py:function:: exists_unlearned_model(task: Literal['scenes', 'objects', 'breeds', 'people'], method: Literal['munba', 'uce', 'distil'], num_train_epochs: int, target: str, base_folder: str = 'assets') -> bool

.. py:function:: get_generated_dataset_folder(task: Literal['scenes', 'objects', 'breeds', 'people'], method: Literal['munba', 'uce', 'distil'], num_train_epochs: int, target: str, base_folder: str = 'assets') -> str

.. py:function:: get_generated_dataset_file(lora_state: Literal['on', 'off'], seed: int, prompt: str) -> str

.. py:function:: exists_unlearned_dataset(generated_dataset_output_path: str, generate_dataset_seeds: List[int], prompts: List[str]) -> bool

   Return True if the entity dataset folder contains all expected on_* images.

   Only on_* (unlearned model) images are counted. off_* files that may exist in
   legacy entity folders (pre-baseline-refactor data) are ignored so that old datasets
   remain valid without requiring a re-generation pass.

   Baseline lora_state='off' images live in the shared baseline folder; see
   get_shared_baseline_folder() and get_off_image_path().

   Expected: ``len(seeds) * len(prompts)`` files matching ``on_*.png``, plus one ``metadata.jsonl``.


.. py:function:: get_shared_baseline_folder(task: Literal['scenes', 'objects', 'breeds', 'people'], base_folder: str = 'assets') -> str

   Return the task-level shared baseline folder path.

   A single shared folder per task holds ALL method-agnostic baseline images
   (generated by 0_generate_dataset_original.py, run once per task, with no LoRA).
   Images are independent of which entity is being forgotten, so one folder serves
   all entities and all methods.

   Convention: assets/datasets/generated_{task}_baseline/


.. py:function:: get_off_image_path(task: Literal['scenes', 'objects', 'breeds', 'people'], target: str, method: Literal['munba', 'uce', 'distil'], num_train_epochs: int, seed: int, prompt: str, base_folder: str = 'assets', seeds: Optional[List[int]] = None, prompts: Optional[List[str]] = None) -> str

   Return the path to a baseline (lora_state='off') image for a given entity/seed/prompt.

   .. note::
       This module-level function and ``GeneratedDataset.get_off_image_path()``
       (classmethod) provide identical functionality.  Both exist because the
       module-level version predates the ``GeneratedDataset`` class; the classmethod
       delegates to this function.  New code should prefer the classmethod for
       consistency with the OO abstraction, but the module-level function is NOT
       vestigial — it is used by legacy callers and remains the implementation
       backing both entry points.

   Fallback / download cascade:

   1. If the shared task-level baseline folder exists locally, use it (preferred).
   2. If ``seeds`` and ``prompts`` (the *full* task-level lists) are provided and the
      baseline folder is absent locally, attempt to download it from HuggingFace via
      ``GeneratedDataset(task, method=None).compute(seeds, prompts)``.  This mirrors
      the OO cascade: local → HF → scratch.  If HF has the data it is downloaded; if
      not, ``_compute_from_scratch`` is called (which requires the base SD pipeline).
   3. Otherwise fall back to the legacy entity folder (``get_generated_dataset_folder()``),
      which was the pre-refactor location for both ``on_*`` and ``off_*`` images.

   :param task: Used for the legacy entity-folder fallback (step 3) and to identify the baseline.
   :param target: Used for the legacy entity-folder fallback (step 3) and to identify the baseline.
   :param method: Used for the legacy entity-folder fallback (step 3) and to identify the baseline.
   :param num_train_epochs: Used for the legacy entity-folder fallback (step 3) and to identify the baseline.
   :param seed: Identify the specific image file to return.
   :param prompt: Identify the specific image file to return.
   :param base_folder: Root assets directory.
   :param seeds: Full task-level seed list, passed together with ``prompts`` — required for
                 ``exists()`` and ``compute()`` on the shared baseline.  When both are provided
                 the function will attempt an HF download if the baseline folder is missing
                 locally (step 2).  If omitted, the function skips the download attempt and
                 falls back directly to the entity folder (backward-compatible).
   :param prompts: Full task-level prompt list, passed together with ``seeds`` — required for
                   ``exists()`` and ``compute()`` on the shared baseline.  When both are provided
                   the function will attempt an HF download if the baseline folder is missing
                   locally (step 2).  If omitted, the function skips the download attempt and
                   falls back directly to the entity folder (backward-compatible).


.. py:class:: GeneratedDataset(/, **data: Any)

   Bases: :py:obj:`pydantic.BaseModel`


   Abstraction over generated image dataset folders.

   Represents exactly one dataset folder — either the shared task-level
   baseline or a method-specific entity dataset.

   .. rubric:: Folder conventions

   - **Shared baseline** (``method=None``):
       ``assets/datasets/generated_{task}_baseline/``
       All method-agnostic off-images for the whole task live here.
       Generated once per task by 0_generate_dataset_original.py.
   - **Entity dataset** (``method=<str>``, ``target=<str>``):
       ``assets/datasets/generated_{task}_{target}_{method}_{epochs:03d}/``
       Contains ``on_*`` unlearned images (and possibly legacy ``off_*``).

   ``compute()`` resolves data in priority order:

   1. Already complete locally → return immediately.
   2. Present in HuggingFace → download, then return.
   3. Neither → call ``_compute_from_scratch()``, which generates images
      from scratch using the Stable Diffusion pipeline.

   After ``_compute_from_scratch()`` completes, if ``upload_if_recomputed``
   is True the dataset folder is uploaded to HuggingFace.

   The ``get_off_image_path`` class method encapsulates the full fallback
   chain for a baseline image: shared baseline → entity folder.


   .. py:attribute:: task
      :type:  _type_task


   .. py:attribute:: target
      :type:  Optional[str]
      :value: None


   .. py:attribute:: method
      :type:  Optional[_type_method]
      :value: None


   .. py:attribute:: num_train_epochs
      :type:  Optional[int]
      :value: None


   .. py:attribute:: base_folder
      :type:  str
      :value: 'assets'


   .. py:attribute:: remote_repository_name
      :type:  str
      :value: 'LeonardoBenitez/VisionUnlearningEvaluationTestbeds'


   .. py:attribute:: recompute_if_exists
      :type:  bool
      :value: False


   .. py:attribute:: upload_if_recomputed
      :type:  bool
      :value: False


   .. py:method:: _validate_consistency() -> GeneratedDataset


   .. py:property:: is_baseline
      :type: bool


      True when this dataset holds baseline (lora-off) images.


   .. py:property:: folder_path
      :type: str


      Local path to the dataset folder.

      Replaces:
        - get_shared_baseline_folder()
        - get_generated_dataset_folder()


   .. py:property:: hf_config_name
      :type: str


      HuggingFace config / folder name (basename of folder_path).

      This is the bare folder name used for local path computation.
      Use ``hf_path_in_repo`` when you need the full HF-side path.


   .. py:property:: hf_path_in_repo
      :type: str


      Full path inside the HuggingFace repository where this dataset lives.

      All generated datasets (baseline and entity) live under the ``datasets/``
      prefix in the HF repo, matching the convention used by the legacy
      synchronisation notebook (0b. Synchronize.ipynb).

      Example: ``"datasets/generated_breeds_baseline"``


   .. py:method:: file_path(lora_state: Literal['on', 'off'], seed: int, prompt: str) -> str

      Full path to one image file inside this dataset folder.

      Replaces get_generated_dataset_file() when used together with a
      GeneratedDataset instance.

      .. note::
         ``lora_state='on'`` is only valid for entity datasets (``method`` set);
         ``lora_state='off'`` is valid for all dataset types.


   .. py:method:: exists(seeds: List[int], prompts: List[str]) -> bool

      Return True if all expected images and metadata are present locally.

      For entity datasets, only on_* images are counted (off_* legacy files
      are ignored — same contract as exists_unlearned_dataset()).
      For baseline datasets, only off_* images are counted.

      Replaces exists_unlearned_dataset() for entity datasets and provides
      the equivalent for baseline folders.

      WARNING — shared baseline: The shared baseline folder contains images for
      ALL entities in the task (N_entities * len(seeds) images total), not just
      the entities in the ``prompts`` argument.  This method counts existing
      off_* files and compares against ``len(seeds) * len(prompts)``.

      If ``prompts`` is a partial (subset) list of the full task prompts,
      ``exists()`` will count more images than expected and incorrectly return
      False, triggering a full re-generation.  Always pass the COMPLETE prompt
      list for the task when calling ``exists()`` on a shared baseline dataset.

      For entity datasets this restriction does not apply because the entity
      folder contains only the images for that specific entity.


   .. py:method:: _compute_from_scratch(seeds: List[int], prompts: List[str], batch_size: int = 16) -> str

      Generate images from scratch and return the folder path.

      For the shared baseline (method=None): loads the base SD pipeline once
      and generates all off-images for all (seed, prompt) pairs, storing them
      in folder_path with the ``off_{seed}_{prompt}.png`` filename convention.

      For entity datasets (method set): loads the already-trained unlearned
      model identified by (task, target, method, num_train_epochs) and generates
      on-images.  Raises FileNotFoundError if the trained model does not exist on
      disk — the caller must run 1_unlearn_from_metadata.py first to produce the
      model weights before calling compute().

      In both cases the method returns self.folder_path after generation.

      Note on metadata.jsonl (entity datasets): ``generate_dataset()`` writes
      ``metadata.jsonl`` to ``self.folder_path`` as its last step.  This is
      verified end-to-end for the shared baseline path.  For entity datasets,
      ``generate_dataset()`` itself writes the file in both the LoRA and UCE
      paths (see vision_unlearning/utils/data_generation.py line 165), but the
      unit tests for this method mock ``generate_dataset`` and therefore do not
      exercise the actual file write.  If the ``generate_dataset`` implementation
      changes and stops writing ``metadata.jsonl``, the entity path here would
      silently produce an incomplete dataset.

      :param seeds: Generation seeds.
      :type seeds: list of int
      :param prompts: Text prompts — one per image template, excluding seed variation.
      :type prompts: list of str
      :param batch_size: Number of prompts per pipeline call.  Default 16 (optimal for 8–12 GB
                         VRAM on this hardware; see perf test in PLAN-TASK-2026-05-19-Baseline.md).
      :type batch_size: int


   .. py:method:: compute(seeds: List[int], prompts: List[str], batch_size: int = 16) -> str

      Ensure the dataset is available locally and return its folder path.

      Resolution order:

      1. Already complete locally → return immediately.
      2. Present in HuggingFace → download, return.
      3. Neither → call ``_compute_from_scratch()``.
         After generation completes, if ``upload_if_recomputed=True``, upload
         the folder to HuggingFace.

      :param seeds: Generation seeds.
      :type seeds: list of int
      :param prompts: Text prompts.  For shared baseline datasets, this MUST be the complete
                      prompt list for the task (all entities).  Passing a partial list will
                      cause ``exists()`` to return False and trigger unnecessary re-generation.
                      See ``exists()`` docstring for details.
      :type prompts: list of str
      :param batch_size: Prompts per pipeline call, forwarded to ``_compute_from_scratch()``.
                         Ignored if the data is already available locally or on HuggingFace.
                         Default 16 (optimal for 8–12 GB VRAM; see perf results in
                         PLAN-TASK-2026-05-19-Baseline.md).
      :type batch_size: int

      :returns: The local folder path to the (now complete) dataset.
      :rtype: str


   .. py:method:: get_off_image_path(task: _type_task, target: str, method: _type_method, num_train_epochs: int, seed: int, prompt: str, base_folder: str = 'assets', seeds: Optional[List[int]] = None, prompts: Optional[List[str]] = None) -> str
      :classmethod:


      Return the path to a baseline (lora_state='off') image.

      Fallback / download cascade:

      1. Shared task-level baseline folder present locally (preferred).
      2. If ``seeds`` and ``prompts`` (full task-level lists) are provided and the
         baseline folder is absent, download it from HuggingFace via
         ``GeneratedDataset(task, method=None).compute(seeds, prompts)``.
      3. Legacy entity folder (pre-refactor mixed ``on_*`` + ``off_*`` format).

      This class method delegates to the module-level get_off_image_path() which
      implements the same cascade.  Both exist; prefer this classmethod for new
      code using GeneratedDataset.

      :param task: Used for the legacy entity-folder fallback (step 3).
      :param target: Used for the legacy entity-folder fallback (step 3).
      :param method: Used for the legacy entity-folder fallback (step 3).
      :param num_train_epochs: Used for the legacy entity-folder fallback (step 3).
      :param seed: Identify the specific image file.
      :param prompt: Identify the specific image file.
      :param base_folder: Root assets directory.
      :param seeds: Full task-level seed list, passed together with ``prompts``.  Required to
                    enable the HF download cascade (step 2).  When omitted the function falls
                    back directly to the entity folder (backward-compatible).
      :param prompts: Full task-level prompt list, passed together with ``seeds``.  Required to
                      enable the HF download cascade (step 2).  When omitted the function falls
                      back directly to the entity folder (backward-compatible).


.. py:function:: get_similarity_clip_path(task: Literal['scenes', 'objects', 'breeds', 'people'], base_folder: str = 'assets') -> str

.. py:function:: get_similarity_clip_df(task: Literal['scenes', 'objects', 'breeds', 'people'], base_folder: str = 'assets') -> pandas.DataFrame

.. py:function:: calculate_similarity_clip(task: Literal['scenes', 'objects', 'breeds', 'people'], labels: List[str], base_folder: str = 'assets') -> pandas.DataFrame

.. py:function:: plot_heatmap(df, figsize=None, cmap='viridis', title='Heatmap')

   Plot a heatmap for a square DataFrame with all labels visible.

   :param df: A square DataFrame with same string labels for index and columns.
   :type df: pd.DataFrame
   :param figsize: Figure size (width, height). Increase if labels overlap.
   :type figsize: tuple
   :param cmap: Colormap name for matplotlib.
   :type cmap: str