From dc2019339df16689013248dc6c78bbde4337794f Mon Sep 17 00:00:00 2001
From: Max Wofford
Date: Wed, 28 May 2025 18:20:22 -0400
Subject: [PATCH] Add repo pulling job for more recent events

In implementation I'm finding github's repo events are delayed by 1-3 days.
---
 app/jobs/pull_repo_commits_job.rb  | 135 +++++++++++++++++++++++++++++
 app/models/project_repo_mapping.rb |  17 ++++
 2 files changed, 152 insertions(+)
 create mode 100644 app/jobs/pull_repo_commits_job.rb

diff --git a/app/jobs/pull_repo_commits_job.rb b/app/jobs/pull_repo_commits_job.rb
new file mode 100644
index 0000000..394a517
--- /dev/null
+++ b/app/jobs/pull_repo_commits_job.rb
@@ -0,0 +1,135 @@
+require "http"
+
+# Pulls the last few days of commits for one GitHub repository and hands every new commit
+# authored by the given user to ProcessCommitJob.
+class PullRepoCommitsJob < ApplicationJob
+  queue_as :literally_whenever
+
+  # How far back to look for commits on each run.
+  LOOKBACK = 3.days
+  # Largest page size the GitHub REST API accepts (the default is 30).
+  COMMITS_PER_PAGE = 100
+
+  # Retry on common network issues or temporary API errors
+  retry_on HTTP::TimeoutError, HTTP::ConnectionError, wait: :exponentially_longer, attempts: 5
+  retry_on JSON::ParserError, wait: 10.seconds, attempts: 3 # If API returns malformed JSON
+
+  discard_on ActiveJob::DeserializationError # If User record is gone
+
+  # Fetches the recent commits of owner/repo with the user's GitHub token and processes them.
+  #
+  # @param user_id [Integer] id of the User whose token is used and whose commits are kept
+  # @param owner [String] repository owner login
+  # @param repo [String] repository name
+  def perform(user_id, owner, repo)
+    user = User.find_by(id: user_id)
+
+    unless user
+      Rails.logger.warn "[PullRepoCommitsJob] User ##{user_id} not found. Skipping."
+      return
+    end
+
+    unless user.github_access_token.present?
+      Rails.logger.warn "[PullRepoCommitsJob] User ##{user.id} missing GitHub token. Skipping."
+      return
+    end
+
+    Rails.logger.info "[PullRepoCommitsJob] Pulling commits for #{owner}/#{repo} for User ##{user.id}"
+
+    # Send the cutoff in UTC ("...Z"): a "+HH:MM" zone offset would leave a raw "+" in the
+    # query string, which is decoded as a space. Only the first page of results is read.
+    since_date = LOOKBACK.ago.utc.iso8601
+    api_url = "https://api.github.com/repos/#{owner}/#{repo}/commits?since=#{since_date}&per_page=#{COMMITS_PER_PAGE}"
+
+    begin
+      response = HTTP.headers(
+        "Accept" => "application/vnd.github.v3+json",
+        "Authorization" => "Bearer #{user.github_access_token}",
+        "X-GitHub-Api-Version" => "2022-11-28"
+      ).timeout(connect: 5, read: 10).get(api_url)
+
+      if response.status.success?
+        process_commits(user, response.parse)
+      elsif response.status.code == 404
+        Rails.logger.warn "[PullRepoCommitsJob] Repository #{owner}/#{repo} not found (404) for User ##{user.id}."
+      elsif response.status.code == 403 # Forbidden, could be rate limit or permissions
+        # Compare the raw header: a missing header must not count as "0 remaining"
+        # (nil.to_i == 0), or a plain permissions error would be re-enqueued forever.
+        if response.headers["X-RateLimit-Remaining"].to_s == "0"
+          reset_time = Time.at(response.headers["X-RateLimit-Reset"].to_i)
+          delay_seconds = [ (reset_time - Time.current).ceil, 5 ].max # at least 5s delay
+          Rails.logger.warn "[PullRepoCommitsJob] GitHub API rate limit exceeded for User ##{user.id}. Retrying in #{delay_seconds}s."
+          self.class.set(wait: delay_seconds.seconds).perform_later(user.id, owner, repo)
+        else
+          Rails.logger.error "[PullRepoCommitsJob] GitHub API forbidden (403) for User ##{user.id}. Response: #{response.body.to_s.truncate(500)}"
+        end
+      else
+        Rails.logger.error "[PullRepoCommitsJob] GitHub API error for User ##{user.id}. Status: #{response.status}. Response: #{response.body.to_s.truncate(500)}"
+        raise "GitHub API Error: Status #{response.status}" if response.status.server_error? || response.status.code == 401
+      end
+    rescue HTTP::Error => e
+      Rails.logger.error "[PullRepoCommitsJob] HTTP Error fetching commits for #{owner}/#{repo} (User ##{user.id}): #{e.message}"
+      raise # Re-raise to allow GoodJob to retry based on retry_on
+    rescue JSON::ParserError => e
+      Rails.logger.error "[PullRepoCommitsJob] JSON Parse Error for #{owner}/#{repo} (User ##{user.id}): #{e.message}"
+      raise # Re-raise to allow GoodJob to retry based on retry_on
+    end
+  end
+
+  private
+
+  # Enqueues a ProcessCommitJob for every listed commit that is not stored yet and was
+  # authored by the user.
+  #
+  # @param user [User] the user whose commits are wanted
+  # @param commits_data [Array<Hash>] parsed "list commits" response from the GitHub API
+  def process_commits(user, commits_data)
+    return if commits_data.empty?
+
+    # Get existing commit SHAs to avoid duplicates
+    shas_to_check = commits_data.map { |c| c["sha"] }.uniq
+    existing_shas = Commit.where(sha: shas_to_check).pluck(:sha).to_set
+
+    enqueued_count = 0
+
+    commits_data.each do |commit_data|
+      commit_sha = commit_data["sha"]
+
+      # Skip if commit already exists
+      next if existing_shas.include?(commit_sha)
+
+      # Each entry of the list response already carries the GitHub author (id and login),
+      # so no extra request per commit is needed to decide whose commit it is.
+      author = commit_data["author"]
+      unless authored_by?(user, author)
+        Rails.logger.debug "[PullRepoCommitsJob] Skipping commit #{commit_sha} - author ID #{author&.dig("id")}/login #{author&.dig("login")} doesn't match user ID #{user.github_uid}/login #{user.github_username}"
+        next
+      end
+
+      Rails.logger.info "[PullRepoCommitsJob] Enqueuing ProcessCommitJob for SHA #{commit_sha}, User ##{user.id}"
+      ProcessCommitJob.perform_later(user.id, commit_sha, commit_data["url"], "github")
+      enqueued_count += 1
+    end
+
+    Rails.logger.info "[PullRepoCommitsJob] Processed #{commits_data.size} commits. Enqueued #{enqueued_count} new ProcessCommitJob(s)."
+  end
+
+  # Whether the GitHub author object belongs to the user.
+  #
+  # A commit whose author is not linked to a GitHub account has a nil author; that must never
+  # match, even when the user's own github_uid / github_username are blank.
+  #
+  # @param user [User]
+  # @param author [Hash, nil] "author" object of a commit from the GitHub API
+  # @return [Boolean] truthy when the author's id or login matches the user
+  def authored_by?(user, author)
+    return false unless author
+
+    # Compared as strings: the API returns an Integer id, the stored uid may be a String.
+    id_match = author["id"].present? && author["id"].to_s == user.github_uid.to_s
+    # GitHub logins are case-insensitive.
+    login_match = author["login"].present? && author["login"].casecmp?(user.github_username.to_s)
+
+    id_match || login_match
+  end
+end
diff --git a/app/models/project_repo_mapping.rb b/app/models/project_repo_mapping.rb
index a5478a0..b2b7792 100644
--- a/app/models/project_repo_mapping.rb
+++ b/app/models/project_repo_mapping.rb
@@ -20,6 +20,8 @@ class ProjectRepoMapping < ApplicationRecord
     "<>"
   ]
 
+  after_create_commit :schedule_commit_pull
+
   private
 
   def repo_url_exists
@@ -27,4 +29,19 @@ class ProjectRepoMapping < ApplicationRecord
       errors.add(:repo_url, "is not cloneable")
     end
   end
+
+  # Enqueues a pull of recent commits for a newly created mapping. Only github.com URLs are
+  # handled, because PullRepoCommitsJob talks to the GitHub API.
+  def schedule_commit_pull
+    # Accepts https://github.com/owner/repo, optionally ending in ".git" and/or "/"
+    match = repo_url.to_s.match(%r{\Ahttps?://(?:www\.)?github\.com/(?<owner>[^/]+)/(?<repo>[^/]+?)(?:\.git)?/?\z}i)
+    return unless match
+
+    owner = match[:owner]
+    repo = match[:repo]
+    Rails.logger.info "[ProjectRepoMapping] Scheduling commit pull for #{owner}/#{repo} for User ##{user_id}"
+    # Enqueued after commit rather than run inline, so record creation never waits on (or is
+    # rolled back by) GitHub API calls.
+    PullRepoCommitsJob.perform_later(user_id, owner, repo)
+  end
 end