Source code for scistag.gitstag.git_scanner

import os
import fnmatch
from stat import S_ISDIR
from typing import List


[docs]class GitScanner: """ Scans a git repository and creates a list of non-ignored files and their size. This is used in the unit test to verify no garbage is committed into the repo. Note that this class is a very minimalistic approach for a coarse validity check and does not fulfill all git standards ignore masks etc. So feel free to use this class but don't complain ;). """ def __init__(self): """ Initiailizer """ self.total_size = 0 "The count of valid directories" self.file_count = 0 "The count of valid files" self.dir_count = 0 "The count of directories" self.dir_list: list[dict] = [] '''The list of all directories parsed. Format: {"path": path, "ignored": false}. Note that only the highest 'level ignored directories will be listed, not the nested ones.''' self.file_list: list[dict] = [] 'The list of all non ignored files. Format: {"filename": name, "size": size_in_bytes}' self.file_list_by_size: list[dict] = [] 'The list of all non ignored files by size. Format: {"filename": name, "size": size_in_bytes}'
[docs] @classmethod def get_is_ignored(cls, name: str, is_dir: bool, ignore_list: list[str]): """ Check is a given name is ignored :param name: The file or directory name :param is_dir: Is the element a directory? :param ignore_list: The ignore list :return: True if the element shall be ignored """ for element in ignore_list: if "dart_tool" in element and "dart_tool" in name: pass if element.endswith("/"): if is_dir and fnmatch.fnmatch(name, element[0:-1]): return True else: if fnmatch.fnmatch(name, element): return True return False
[docs] @classmethod def add_git_ignore(cls, base_path, org_list: list[str], filename: str) -> list[str]: """ Creates an update gitignore mask list for the current directory :param base_path: The base path we are starting in :param org_list: The previous ignore list :param filename: The path of the .gitignore file :return: """ with open(filename, "r") as ignore_file: all_lines = ignore_file.readlines() all_lines = [element.rstrip("\n") for element in all_lines if not element.startswith("#")] all_lines = [element for element in all_lines if not len(element) == 0] all_lines = [os.path.normpath(f"{base_path}/{element}") if element.startswith("/") else f"*/" + element for element in all_lines] return org_list + all_lines
[docs] @classmethod def find_valid_repo_files(cls, path: str, file_list: List, ignore_list: List, dir_list: List) -> None: """ Finds a list of valid files in the current directory. Continues the search in subdirectories :param path: The base path :param file_list: The current list of file :param ignore_list: The ignore list :param dir_list: The list of parsed directories """ cur_content = os.listdir(path) git_ignore_path = f"{path}/.gitignore" if os.path.exists(git_ignore_path): ignore_list = cls.add_git_ignore(path, ignore_list, git_ignore_path) for cur_element in cur_content: full_path = os.path.normpath(f"{path}/{cur_element}") cur_stat = os.stat(full_path) is_dir: bool = S_ISDIR(cur_stat.st_mode) is_ignored = cls.get_is_ignored(full_path, is_dir=is_dir, ignore_list=ignore_list) if is_dir: dir_list.append({"path": full_path, "ignored": is_ignored}) if not is_ignored: if is_dir: cls.find_valid_repo_files(full_path, file_list, ignore_list, dir_list=dir_list) else: file_list.append({"filename": full_path, "size": cur_stat.st_size})
[docs] @classmethod def compute_total_size(cls, filelist: List) -> int: """ Computes the total size of the non ignored files :param filelist: The list of files :return: The size in bytes """ total_size = 0 for element in filelist: total_size += element['size'] return total_size
[docs] def scan(self, path: str): """ Executes a scan on given base directory. The results are stored in the member variables of this object (.file_list, .total_size etc.) :param path: The repository base path """ res_file_list = [] cur_ignore_list = ["*/.git/*"] self.dir_list.clear() self.find_valid_repo_files(path, res_file_list, ignore_list=cur_ignore_list, dir_list=self.dir_list) self.total_size = self.compute_total_size(res_file_list) self.file_list = res_file_list res_file_list = sorted(res_file_list, key=lambda x: x["size"], reverse=True) self.dir_count = len(self.dir_list) self.file_count = len(self.file_list) self.file_list_by_size = res_file_list
[docs] def get_large_files(self, min_size: int, ignore_list: list[str], hard_limit_size: int = -1) -> list[str]: """ Returns all files larger than a given threshold which are not on the ignore list :param min_size: The minimum size in bytes :param hard_limit_size: If this size is exceeded even files on the ignore list will not be ignored. -1 if there is no hard limit. :param ignore_list: Masks of the files to ignore :return: The list of all remaining files """ large_elements = [element for element in self.file_list if element['size'] >= min_size] result_list: list[str] = [] for element in large_elements: skip = False if hard_limit_size == -1 or element['size'] < hard_limit_size: for ignore_entry in ignore_list: if fnmatch.fnmatch(element['filename'], ignore_entry): skip = True break if skip: continue else: result_list.append(element['filename']) return result_list