diff --git a/scripts/services/docker/Dockerfile.git_integration b/scripts/services/docker/Dockerfile.git_integration index 30a5890259..c2a8068d40 100644 --- a/scripts/services/docker/Dockerfile.git_integration +++ b/scripts/services/docker/Dockerfile.git_integration @@ -65,6 +65,7 @@ FROM base AS runner RUN apt-get update && apt-get install -y \ ca-certificates \ git \ + ripgrep \ --no-install-recommends \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean \ diff --git a/services/apps/git_integration/src/crowdgit/models/maintainer_info.py b/services/apps/git_integration/src/crowdgit/models/maintainer_info.py index 5a420567ae..1752999e54 100644 --- a/services/apps/git_integration/src/crowdgit/models/maintainer_info.py +++ b/services/apps/git_integration/src/crowdgit/models/maintainer_info.py @@ -34,3 +34,5 @@ class MaintainerResult(BaseModel): maintainer_file: str | None = None maintainer_info: list[MaintainerInfoItem] | None = None total_cost: float = 0 + candidate_files: list[tuple[str, int]] = [] + ai_suggested_file: str | None = None diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 6c4e532017..808cf2e5e9 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -1,5 +1,4 @@ import asyncio -import base64 import os import time as time_module from datetime import datetime, time, timezone @@ -20,6 +19,7 @@ ) from crowdgit.enums import ErrorCode, ExecutionStatus, OperationType from crowdgit.errors import ( + CommandExecutionError, CrowdGitError, MaintainerFileNotFoundError, MaintainerIntervalNotElapsedError, @@ -37,7 +37,7 @@ from crowdgit.models.service_execution import ServiceExecution from crowdgit.services.base.base_service import BaseService from crowdgit.services.maintainer.bedrock import invoke_bedrock -from crowdgit.services.utils import parse_repo_url +from crowdgit.services.utils import run_shell_command from crowdgit.settings import MAINTAINER_RETRY_INTERVAL_DAYS, MAINTAINER_UPDATE_INTERVAL_HOURS @@ -45,25 +45,92 @@ class MaintainerService(BaseService): """Service for processing maintainer data""" MAX_CHUNK_SIZE = 5000 - MAX_CONCURRENT_CHUNKS = 3 # Maximum concurrent chunk processing - # List of common maintainer file names - MAINTAINER_FILES = [ - "MAINTAINERS", - "MAINTAINERS.md", - "MAINTAINER.md", - "CODEOWNERS.md", - "CONTRIBUTORS", - "CONTRIBUTORS.md", - "docs/MAINTAINERS.md", - "OWNERS", - "CODEOWNERS", - ".github/MAINTAINERS.md", - ".github/CONTRIBUTORS.md", - "GOVERNANCE.md", - "README.md", + MAX_CONCURRENT_CHUNKS = 3 + MAX_AI_FILE_LIST_SIZE = 300 + + # Full paths that get the highest score bonus when matched exactly + KNOWN_PATHS = { + "maintainers", + "maintainers.md", + "maintainer.md", + "codeowners", + "codeowners.md", + "contributors", + "contributors.md", + "owners", + "owners.md", + "authors", + "authors.md", + "governance.md", + "docs/maintainers.md", + ".github/maintainers.md", + ".github/contributors.md", + ".github/codeowners", "SECURITY-INSIGHTS.md", + } + + # Governance stems (basename without extension, lowercased) for filename search + GOVERNANCE_STEMS = { + "maintainers", + "maintainer", + "codeowners", + "codeowner", + "contributors", + "contributor", + "owners", + "owners_aliases", + "authors", + "committers", + "commiters", + "reviewers", + "approvers", + "administrators", + "stewards", + "credits", + "governance", + "core_team", + "code_owners", + "emeritus", + "workgroup", + } + + VALID_EXTENSIONS = { + "", + ".md", + ".markdown", + ".txt", + ".rst", + ".yaml", + ".yml", + ".toml", + ".adoc", + ".csv", + ".rdoc", + } + + SCORING_KEYWORDS = [ + "maintainer", + "codeowner", + "owner", + "contributor", + "governance", + "steward", + "emeritus", + "approver", + "reviewer", ] + EXCLUDED_FILENAMES = { + "contributing.md", + "contributing", + "code_of_conduct.md", + "code-of-conduct.md", + } + + FULL_PATH_SCORE = 100 + STEM_MATCH_SCORE = 50 + PARTIAL_STEM_SCORE = 25 + def make_role(self, title: str): title = title.lower() title = ( @@ -124,14 +191,18 @@ async def compare_and_update_maintainers( for github_username, maintainer in new_maintainers_dict.items(): role = maintainer.normalized_title original_role = self.make_role(maintainer.title) - if github_username == "unknown": + if github_username == "unknown" and maintainer.email in ("unknown", None): self.logger.warning( - f"Skipping unkown github_username with title {maintainer.title}" + f"Skipping unknown github_username & email with title {maintainer.title}" ) continue elif github_username not in current_maintainers_dict: # New maintainer - identity_id = await find_github_identity(github_username) + identity_id = ( + await find_github_identity(github_username) + if github_username != "unknown" + else await find_maintainer_identity_by_email(maintainer.email) + ) self.logger.info(f"Found new maintainer {github_username} to be inserted") if identity_id: await upsert_maintainer( @@ -141,7 +212,7 @@ async def compare_and_update_maintainers( f"Successfully inserted new maintainer {github_username} with identity_id {identity_id}" ) else: - # will happend for new users if their identity isn't created yet but should fixed on the next iteration + # will happen for new users if their identity isn't created yet but should be fixed on the next iteration self.logger.warning(f"Identity not found for username: {github_username}") else: # Existing maintainer @@ -234,7 +305,6 @@ def get_extraction_prompt(self, filename: str, content_to_analyze: str) -> str: """ async def analyze_file_content(self, maintainer_filename: str, content: str): - prompt = self.get_extraction_prompt(maintainer_filename, content) if len(content) > self.MAX_CHUNK_SIZE: self.logger.info( "Maintainers file content exceeded max chunk size, splitting into chunks" @@ -278,7 +348,10 @@ async def process_chunk(chunk_index: int, chunk: str): aggregated_info.cost += chunk_info.cost maintainer_info = aggregated_info else: - maintainer_info = await invoke_bedrock(prompt, pydantic_model=MaintainerInfo) + maintainer_info = await invoke_bedrock( + self.get_extraction_prompt(maintainer_filename, content), + pydantic_model=MaintainerInfo, + ) self.logger.info("Maintainers file content analyzed by AI") self.logger.info(f"Maintainers response: {maintainer_info}") if maintainer_info.output.info is not None: @@ -299,33 +372,40 @@ async def process_chunk(chunk_index: int, chunk: str): ai_cost=maintainer_info.cost, ) - def get_maintainer_file_prompt(self, example_files: list[str], file_names: list[str]) -> str: + def get_maintainer_file_prompt( + self, example_files: list[str], candidates: list[tuple[str, int]] + ) -> str: """ Generates the prompt for the LLM to identify a maintainer file from a list. + candidates: list of (filename, score) where score reflects name-match strength. """ example_files_str = "\n".join(f"- {name}" for name in example_files) - file_names_str = "\n".join(f"- {name}" for name in file_names) + candidates_str = "\n".join(f"- {name} [score={score}]" for name, score in candidates) return f""" - You are an expert AI assistant specializing in identifying repository governance files. Your task is to find a maintainer file from a given list of filenames. + You are an expert AI assistant specializing in identifying repository governance files. Your task is to find the single best maintainer file from a given list of candidates. - 1. **Analyze the Input**: Carefully review the list of filenames provided in the `` tag. - 2. **Identify a Maintainer File**: Compare each filename against the characteristics of a maintainer file. These files typically define project ownership, governance, or code owners. Use the `` as a guide. - 3. **Apply Rules**: Follow all constraints listed in the `` section, especially the exclusion rule. - 4. **Select the First Match**: Scan the list and select the *first* filename that you identify as a maintainer file. You only need to find one. Once a match is found, stop searching. + 1. **Analyze the Input**: Carefully review the list of candidates in the `` tag. Each entry shows the file path and a pre-computed name-match score. + 2. **Identify the Best Maintainer File**: Compare each candidate against the characteristics of a maintainer file. These files typically define project ownership, governance, or code owners. Use the `` as a guide. + 3. **Use Signals to Rank**: When multiple candidates qualify, prefer: + - Higher **score** — stronger filename match against known governance patterns. + - Fewer path separators (`/`) in the path — files closer to the repo root apply to the whole project; deeply nested files are usually component-specific. + - When score and nesting conflict, prefer the file most likely to be the repo-wide governance file. + 4. **Apply Rules**: Follow all constraints listed in the `` section. 5. **Format the Output**: Return your answer as a single JSON object according to the `` specification, and nothing else. - **Definition**: A maintainer file's name usually contains keywords like `MAINTAINERS`, `CODEOWNERS`, or `OWNERS`. - **Exclusion**: The filename `CONTRIBUTING.md` must ALWAYS be ignored and never selected, even if it's the only file that seems relevant. + - **Third-party exclusion**: Do NOT select files that are inside directories associated with vendored dependencies, third-party libraries, or packages consumed by the project (e.g. paths containing `vendor/`, `node_modules/`, `third_party/`, `external/`, `.cache/`, `dist/`, `site-packages/`). These files belong to external projects, not this repository's own governance. - **No Match**: If no file in the list matches the criteria after checking all of them, you must return the 'not_found' error. - **Empty Input**: If the `` is empty or contains no filenames, you must return the 'not_found' error. - - **If a maintainer file is found**: Return a JSON object in the format `{{"file_name": ""}}`. + - **If a maintainer file is found**: Return a JSON object in the format `{{"file_name": ""}}`. - **If no maintainer file is found**: Return a JSON object in the format `{{"error": "not_found"}}`. @@ -334,15 +414,18 @@ def get_maintainer_file_prompt(self, example_files: list[str], file_names: list[ - {file_names_str} + {candidates_str} Return only the final JSON object. """ - async def find_maintainer_file_with_ai(self, file_names): + async def find_maintainer_file_with_ai( + self, candidates: list[tuple[str, int]] + ) -> tuple[str | None, float]: + """Ask AI to select the best maintainer file from scored candidates.""" self.logger.info("Using AI to find maintainer files...") - prompt = self.get_maintainer_file_prompt(self.MAINTAINER_FILES, file_names) + prompt = self.get_maintainer_file_prompt(sorted(self.KNOWN_PATHS), candidates) result = await invoke_bedrock(prompt, pydantic_model=MaintainerFile) if result.output.file_name is not None: @@ -351,74 +434,255 @@ async def find_maintainer_file_with_ai(self, file_names): else: return None, result.cost - async def find_maintainer_file(self, repo_path: str, owner: str, repo: str): - self.logger.info(f"Looking for maintainer files in {owner}/{repo}...") - - file_names = await aiofiles.os.listdir(repo_path) - - for file in self.MAINTAINER_FILES: - file_path = os.path.join(repo_path, file) - if await aiofiles.os.path.isfile(file_path): - self.logger.info(f"maintainer file: {file_path} found in repo") - async with aiofiles.open(file_path, "r", encoding="utf-8") as f: - content = await f.read() - - if file.lower() == "readme.md" and "maintainer" not in content.lower(): - self.logger.info(f"Skipping {file}: no maintainer-related content found") - continue + async def _list_repo_files(self, repo_path: str) -> list[str]: + """List non-code files in the repo recursively, filtered by VALID_EXTENSIONS.""" + glob_args = ["--glob", "!.git/"] + for ext in self.VALID_EXTENSIONS: + glob_args.extend(["--iglob", f"*{ext}"]) - return file, base64.b64encode(content.encode()).decode(), 0 + output = await run_shell_command( + ["rg", "--files", "--hidden", *glob_args, "."], cwd=repo_path + ) + return [ + line[2:] if line.startswith("./") else line + for line in output.strip().split("\n") + if line.strip() + ] - self.logger.warning("No maintainer files found using the known file names.") + async def _ripgrep_search(self, repo_path: str) -> list[str]: + """Search for files whose basename matches a governance stem, at any depth.""" + glob_args = ["--glob", "!.git/"] + for stem in self.GOVERNANCE_STEMS: + glob_args.extend(["--iglob", f"*{stem}*"]) - file_name, ai_cost = await self.find_maintainer_file_with_ai(file_names) + try: + output = await run_shell_command( + ["rg", "--files", "--hidden", *glob_args, "."], cwd=repo_path + ) + except CommandExecutionError: + self.logger.info("Ripgrep found no governance files by filename") + return [] + except Exception as e: + self.logger.warning(f"Ripgrep search failed: {repr(e)}") + return [] - if file_name: - file_path = os.path.join(repo_path, file_name) - if await aiofiles.os.path.isfile(file_path): + results = [] + for line in output.strip().split("\n"): + line = line.strip() + if not line: + continue + if line.startswith("./"): + line = line[2:] + basename = os.path.basename(line).lower() + if basename in self.EXCLUDED_FILENAMES: + continue + ext = os.path.splitext(basename)[1] + if ext not in self.VALID_EXTENSIONS: + continue + results.append(line) + + self.logger.info(f"Ripgrep found {len(results)} governance files by filename") + return results + + def _score_filename(self, candidate_path: str) -> int: + """Score by how closely the filename matches known governance patterns.""" + path = candidate_path.lower() + if path in self.KNOWN_PATHS: + return self.FULL_PATH_SCORE + stem = os.path.splitext(os.path.basename(path))[0].lstrip(".") + if stem in self.GOVERNANCE_STEMS: + return self.STEM_MATCH_SCORE + if any(known_stem in stem for known_stem in self.GOVERNANCE_STEMS): + return self.PARTIAL_STEM_SCORE + return 0 + + async def find_candidate_files(self, repo_path: str) -> list[tuple[str, str, int]]: + """ + Find governance files by filename, score them, and return all candidates sorted by score. + Scoring: full known-path match (100) > exact stem (50) > partial stem (25) + content keywords (+1 each). + """ + found_paths = await self._ripgrep_search(repo_path) + if not found_paths: + return [] + + scored = [] + for candidate_path in found_paths: + file_path = os.path.join(repo_path, candidate_path) + try: async with aiofiles.open(file_path, "r", encoding="utf-8") as f: content = await f.read() + except Exception as e: + self.logger.warning(f"Failed to read candidate {candidate_path}: {repr(e)}") + continue - if file_name.lower() == "readme.md" and "maintainer" not in content.lower(): - self.logger.info( - f"AI suggested {file_name}, but it has no maintainer-related content. Skipping." - ) - return None, None, ai_cost + filename_score = self._score_filename(candidate_path) + content_score = sum(1 for kw in self.SCORING_KEYWORDS if kw in content.lower()) + total = filename_score + content_score - self.logger.info(f"\nMaintainer file found: {file_name}") - return file_name, base64.b64encode(content.encode()).decode(), ai_cost + scored.append((candidate_path, content, total)) + self.logger.info( + f"Candidate: {candidate_path} " + f"(filename: {filename_score}, content: {content_score}, total: {total})" + ) - return None, None, ai_cost + scored.sort(key=lambda c: c[2], reverse=True) - async def extract_maintainers(self, repo_path: str, owner: str, repo: str): - total_cost = 0 + if scored: + self.logger.info(f"Top candidate: {scored[0][0]} (from {len(scored)} total)") + else: + self.logger.info("No valid candidates after scoring") + return scored - self.logger.info("Looking for maintainer file...") - maintainer_file, file_content, cost = await self.find_maintainer_file( - repo_path, owner, repo - ) - total_cost += cost + async def analyze_and_build_result(self, filename: str, content: str) -> MaintainerResult: + """ + Analyze file content with AI and return a MaintainerResult. + Raises MaintanerAnalysisError if no maintainers are found. + """ + self.logger.info(f"Analyzing maintainer file: {filename}") + if "readme" in filename.lower() and "maintainer" not in content.lower(): + self.logger.warning( + f"Skipping README file '{filename}': no 'maintainer' keyword found in content" + ) + raise MaintanerAnalysisError(error_code=ErrorCode.NO_MAINTAINER_FOUND) + result = await self.analyze_file_content(filename, content) - if not maintainer_file or not file_content: - self.logger.error("No maintainer file found") - raise MaintainerFileNotFoundError(ai_cost=total_cost) + if not result.output.info: + raise MaintanerAnalysisError(ai_cost=result.cost) - decoded_content = base64.b64decode(file_content).decode("utf-8") + return MaintainerResult( + maintainer_file=filename, + maintainer_info=result.output.info, + total_cost=result.cost, + ) + + async def try_saved_maintainer_file( + self, repo_path: str, saved_maintainer_file: str + ) -> tuple[MaintainerResult | None, float]: + """ + Attempt to read and analyze the previously saved maintainer file. + Returns (result, cost) where result is None if the attempt failed. + """ + cost = 0.0 + file_path = os.path.join(repo_path, saved_maintainer_file) - self.logger.info(f"Analyzing maintainer file: {maintainer_file}") - result = await self.analyze_file_content(maintainer_file, decoded_content) - maintainer_info = result.output.info - total_cost += result.cost + if not await aiofiles.os.path.isfile(file_path): + self.logger.warning( + f"Saved maintainer file '{saved_maintainer_file}' no longer exists on disk" + ) + return None, cost - if not maintainer_info: - self.logger.error("Failed to analyze the maintainer file content.") - raise MaintanerAnalysisError(ai_cost=total_cost) + try: + async with aiofiles.open(file_path, "r", encoding="utf-8") as f: + content = await f.read() + + result = await self.analyze_and_build_result(saved_maintainer_file, content) + cost += result.total_cost + return result, cost + except MaintanerAnalysisError as e: + cost += e.ai_cost + self.logger.warning( + f"Saved maintainer file '{saved_maintainer_file}' analysis failed: {e.error_message}" + ) + return None, cost + except Exception as e: + self.logger.warning( + f"Saved maintainer file '{saved_maintainer_file}' processing failed: {repr(e)}" + ) + return None, cost - return MaintainerResult( - maintainer_file=maintainer_file, - maintainer_info=maintainer_info, - total_cost=total_cost, + async def extract_maintainers( + self, + repo_path: str, + saved_maintainer_file: str | None = None, + ): + total_cost = 0 + candidate_files: list[tuple[str, int]] = [] + ai_suggested_file: str | None = None + + def _attach_metadata(result: MaintainerResult) -> MaintainerResult: + result.total_cost = total_cost + result.candidate_files = candidate_files + result.ai_suggested_file = ai_suggested_file + return result + + # Step 1: Try the previously saved maintainer file + if saved_maintainer_file: + self.logger.info(f"Trying saved maintainer file: {saved_maintainer_file}") + result, cost = await self.try_saved_maintainer_file(repo_path, saved_maintainer_file) + total_cost += cost + if result: + return _attach_metadata(result) + self.logger.info("Falling back to maintainer file detection") + + # Step 2: Find top candidate via filename search + scoring + candidates = await self.find_candidate_files(repo_path) + candidate_files = [(path, score) for path, _, score in candidates][:100] + + # Step 3: Try AI analysis on top candidate + failed_candidate: str | None = None + if candidates: + filename, content, _ = candidates[0] + try: + result = await self.analyze_and_build_result(filename, content) + total_cost += result.total_cost + return _attach_metadata(result) + except MaintanerAnalysisError as e: + total_cost += e.ai_cost + self.logger.warning(f"AI analysis failed for '{filename}': {e.error_message}") + except Exception as e: + self.logger.warning(f"Unexpected error analyzing '{filename}': {repr(e)}") + + failed_candidate = filename + self.logger.warning("Top candidate failed, trying AI file detection") + else: + self.logger.warning("No candidate files found via search, trying AI file detection") + + # Step 4: AI file detection as last resort + file_names = await self._list_repo_files(repo_path) + # Pre-filter to governance-scored files to keep the AI prompt within model limits. + # Fall back to a hard-capped slice of the full list if nothing scores. + # Exclude the already-failed top candidate to avoid re-suggesting it. + scored_tuples = [ + (f, self._score_filename(f)) + for f in file_names + if self._score_filename(f) > 0 and f != failed_candidate + ] + ai_input_files: list[tuple[str, int]] = ( + scored_tuples + if scored_tuples + else [ + (f, 0) for f in file_names[: self.MAX_AI_FILE_LIST_SIZE] if f != failed_candidate + ] ) + self.logger.info( + f"Passing {len(ai_input_files)} files to AI for maintainer file detection " + f"(total repo files: {len(file_names)})" + ) + ai_file_name, ai_cost = await self.find_maintainer_file_with_ai(ai_input_files) + ai_suggested_file = ai_file_name + total_cost += ai_cost + + if ai_file_name: + file_path = os.path.join(repo_path, ai_file_name) + if not await aiofiles.os.path.isfile(file_path): + self.logger.warning( + f"AI suggested '{ai_file_name}' but file does not exist on disk" + ) + else: + try: + async with aiofiles.open(file_path, "r", encoding="utf-8") as f: + content = await f.read() + result = await self.analyze_and_build_result(ai_file_name, content) + total_cost += result.total_cost + return _attach_metadata(result) + except MaintanerAnalysisError as e: + total_cost += e.ai_cost + self.logger.warning( + f"AI-suggested file '{ai_file_name}' analysis failed: {e.error_message}" + ) + + self.logger.error("No maintainer file found") + raise MaintainerFileNotFoundError(ai_cost=total_cost) async def check_if_interval_elapsed(self, repository: Repository) -> tuple[bool, float]: """ @@ -494,10 +758,10 @@ async def process_maintainers( ai_cost = 0.0 maintainers_found = 0 maintainers_skipped = 0 + candidate_files: list[str] = [] + ai_suggested_file: str | None = None try: - owner, repo_name = parse_repo_url(batch_info.remote) - has_interval_elapsed, remaining_hours = await self.check_if_interval_elapsed( repository ) @@ -507,10 +771,15 @@ async def process_maintainers( ) self.logger.info(f"Starting maintainers processing for repo: {batch_info.remote}") - maintainers = await self.extract_maintainers(batch_info.repo_path, owner, repo_name) + maintainers = await self.extract_maintainers( + batch_info.repo_path, + saved_maintainer_file=repository.maintainer_file, + ) latest_maintainer_file = maintainers.maintainer_file ai_cost = maintainers.total_cost maintainers_found = len(maintainers.maintainer_info) + candidate_files = maintainers.candidate_files + ai_suggested_file = maintainers.ai_suggested_file if repository.parent_repo: filtered_maintainers = await self.exclude_parent_repo_maintainers( @@ -565,6 +834,8 @@ async def process_maintainers( "ai_cost": ai_cost, "maintainers_found": maintainers_found, "maintainers_skipped": maintainers_skipped, + "candidate_files": candidate_files, + "ai_suggested_file": ai_suggested_file, }, ) await save_service_execution(service_execution)