How to search all the documents in a particular folder in windows? - Python code

Search all the documents in windows OS

In this video it shows the demo of how to search all the documents in a windows folder (including subfolders) for a particular query string. In this demonstration, the app is developed using the Python code.

I hope you like this video. For any questions, suggestions or appreciation please contact us at: https://programmerworld.co/contact/ or email at: programmerworld1990@gmail.com

Details:

Python code:

.\search_app\main.py

import uvicorn
from app.api_interface import app

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8000)

.\search_app\app\api_interface.py

from fastapi import FastAPI, Query
from pydantic import BaseModel
from app.search_engine import search_in_files

app = FastAPI()

class SearchRequest(BaseModel):
    folder_path: str
    query: str
    extensions: list[str] = None

@app.post("/search")
def search_documents(request: SearchRequest):
    """
    API endpoint to search documents in a folder.
    """
    try:
        results = search_in_files(request.folder_path, request.query, request.extensions)
        return {"status": "success", "results": results}
    except Exception as e:
        return {"status": "error", "message": str(e)}

.\search_app\app\search_engine.py

from typing import List, Dict
import os
from PyPDF2 import PdfReader
from docx import Document
import logging

logging.basicConfig(level=logging.INFO)

def search_in_files(folder_path: str, query: str, extensions: List[str]) -> List[Dict]:
    """
    Searches for a query string in files within a specified folder and subfolders.

    Args:
        folder_path (str): The root folder to search in.
        query (str): The search term to look for in files.
        extensions (List[str]): List of file extensions to search.

    Returns:
        List[Dict]: A list of dictionaries containing file paths and matching content snippets.
    """
    results = []

    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)

            # Filter files by extension
            if any(file.lower().endswith(ext.lower()) for ext in extensions):
                try:
                    content = ""

                    # Log file type being processed
                    logging.info(f"Processing file: {file_path}")

                    # Handle TXT files
                    if file.lower().endswith(".txt"):
                        with open(file_path, "r", encoding="utf-8") as f:
                            content = f.read()

                    # Handle PDF files
                    elif file.lower().endswith(".pdf"):
                        reader = PdfReader(file_path)
                        content = " ".join(page.extract_text() for page in reader.pages)

                    # Handle DOCX files
                    elif file.lower().endswith(".docx"):
                        doc = Document(file_path)
                        content = " ".join(paragraph.text for paragraph in doc.paragraphs)

                    # Unsupported extension logging
                    else:
                        logging.error(f"Unsupported file type: {file_path}")
                        continue

                    # Perform case-insensitive search
                    if query.lower() in content.lower():
                        results.append({
                            "file_path": file_path,
                            "snippet": content[:200],  # Include a snippet of the content
                        })

                except Exception as e:
                    logging.error(f"Error processing {file_path}: {e}")

    return results

.\search_app\test_search.py

import requests

# Define the API endpoint and headers
url = "http://127.0.0.1:8000/search"
headers = {
    "Content-Type": "application/json"
}

# Take folder path and query string as input from the user
folder_path = input("Enter the folder path: ").strip()
query = input("Enter the search query: ").strip()

# Define supported file extensions
extensions = [".txt", ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt"]

# Prepare the request payload
data = {
    "folder_path": folder_path,
    "query": query,
    "extensions": extensions
}

# Make the POST request
response = requests.post(url, json=data, headers=headers)

# Check if the response was successful
if response.status_code == 200:
    results = response.json().get("results", [])
    if results:
        print("\nFiles containing the query:\n")
        for result in results:
            print(result["file_path"])  # Print the file path
    else:
        print("\nNo files matched the query.")
else:
    print(f"\nError: {response.status_code} - {response.text}")

Screenshots:

The complete python code of the App along with the batch script is shared in the below link:

https://drive.google.com/file/d/1pck47f3URlwe857jdVtElVyxe5umXKjr/view?usp=sharing

How to search all the documents in a particular folder in windows? – Python code

Leave a Reply Cancel reply