Source code for auto_research.search.files_management

from __future__ import annotations

import warnings

import fitz


# Characters Windows rejects in file names: the reserved punctuation
# < > : " / \ | ? * plus the ASCII control characters (code points 0-31).
# The translation table is built once at import time and maps each to "delete".
_ILLEGAL_FILENAME_TABLE = str.maketrans(
    "", "", '<>:"/\\|?*' + "".join(map(chr, range(32)))
)


def sanitize_filename(filename: str) -> str:
    """
    Sanitizes a filename by removing characters that are not allowed in Windows filenames.

    The reserved punctuation characters (less-than, greater-than, colon,
    double quote, forward slash, backslash, pipe, question mark, asterisk)
    and the ASCII control characters (code points 0-31, e.g. embedded
    newlines or tabs) are deleted wherever they occur. Leading and trailing
    whitespace is then stripped from the result.

    Args:
        filename (str): The original filename to be sanitized.

    Returns:
        str: The sanitized filename with illegal characters removed and
            leading/trailing whitespace stripped. May be an empty string if
            nothing legal remains.

    Example:
        >>> sanitize_filename("my/file:name?.txt")
        'myfilename.txt'
    """
    # One pass over the string deletes every illegal character; stripping
    # happens afterwards so whitespace exposed by the deletions is trimmed too.
    return filename.translate(_ILLEGAL_FILENAME_TABLE).strip()
def is_pdf_uncorrupted(file_path: str) -> bool:
    """
    Checks if a PDF file is uncorrupted by attempting to open it using the `fitz` library.

    Args:
        file_path (str): The path to the PDF file to be checked.

    Returns:
        bool: True if the file can be opened and closed successfully,
            False otherwise.

    Warns:
        UserWarning: Emitted with the underlying error message whenever the
            file cannot be opened. The warning is attributed to the caller
            of this function so the offending call site is visible.

    Example:
        >>> is_pdf_uncorrupted("example.pdf")
        True
        >>> is_pdf_uncorrupted("corrupted.pdf")
        Error opening PDF: <error message>
        False

    Notes:
        This function uses the `fitz` library (PyMuPDF) to open the PDF file.
        If the file cannot be opened for any reason (including a missing
        file), it is assumed to be corrupted, and the function returns False.
        Only opening is attempted; page contents are not read.
    """
    try:
        # Open and immediately release the document; a failure in either
        # step is treated as "not usable".
        doc = fitz.open(file_path)
        doc.close()
        return True
    except Exception as e:
        # Deliberately broad: this is a best-effort probe, so any failure is
        # reported as a warning instead of propagating to the caller.
        # stacklevel=2 makes the warning point at the caller's line rather
        # than at this helper, so the report shows which call hit the bad file.
        warnings.warn(f"Error opening PDF: {e}", UserWarning, stacklevel=2)
        return False