diff --git a/app/avo/resources/commit.rb b/app/avo/resources/commit.rb
new file mode 100644
index 0000000..5ee00d0
--- /dev/null
+++ b/app/avo/resources/commit.rb
@@ -0,0 +1,14 @@
+class Avo::Resources::Commit < Avo::BaseResource
+  # self.includes = []
+  # self.attachments = []
+  # self.search = {
+  #   query: -> { query.ransack(id_eq: params[:q], m: "or").result(distinct: false) }
+  # }
+
+  def fields
+    field :id, as: :id
+    field :sha, as: :text
+    field :user, as: :belongs_to
+    field :github_raw, as: :code
+  end
+end
diff --git a/app/controllers/avo/commits_controller.rb b/app/controllers/avo/commits_controller.rb
new file mode 100644
index 0000000..50e00c9
--- /dev/null
+++ b/app/controllers/avo/commits_controller.rb
@@ -0,0 +1,4 @@
+# This controller has been generated to enable Rails' resource routes.
+# More information on https://docs.avohq.io/3.0/controllers.html
+class Avo::CommitsController < Avo::ResourcesController
+end
diff --git a/app/jobs/process_commit_job.rb b/app/jobs/process_commit_job.rb
new file mode 100644
index 0000000..52d1de0
--- /dev/null
+++ b/app/jobs/process_commit_job.rb
@@ -0,0 +1,142 @@
+require 'http'
+require 'json'
+
+# Fetches a single commit from the provider's API and stores it as a Commit record.
+class ProcessCommitJob < ApplicationJob
+  # Raised for API responses that should be retried, giving retry_on a specific class to match.
+  class GitHubApiError < StandardError; end
+
+  queue_as :literally_whenever
+
+  # Retry on common network issues or temporary API errors.
+  # Uses :polynomially_longer because :exponentially_longer was deprecated in Rails 7.1 and later removed.
+  retry_on HTTP::TimeoutError, HTTP::ConnectionError, GitHubApiError, wait: :polynomially_longer, attempts: 5
+  retry_on JSON::ParserError, wait: 10.seconds, attempts: 3 # If API returns malformed JSON
+
+  discard_on ActiveJob::DeserializationError # If User record is gone
+
+  # @param user_id [Integer] id of the User the commit is attributed to
+  # @param commit_sha [String] SHA the API response is expected to carry
+  # @param commit_api_url [String] API URL the commit is fetched from
+  # @param provider_string [String] provider name; only 'github' is handled
+  def perform(user_id, commit_sha, commit_api_url, provider_string)
+    provider_sym = provider_string.to_sym # Convert string back to symbol
+    user = User.find_by(id: user_id)
+
+    unless user
+      Rails.logger.warn "[ProcessCommitJob] User ##{user_id} not found. Skipping commit #{commit_sha}."
+      return
+    end
+
+    # Idempotency: Check if commit already exists
+    if Commit.exists?(sha: commit_sha)
+      # Rails.logger.info "[ProcessCommitJob] Commit #{commit_sha} already exists. Skipping."
+      # Optionally, you could update provider-specific raw data here if it's from a different provider
+      # and the commit record already exists (e.g., adding gitlab_raw to an existing commit)
+      return
+    end
+
+    Rails.logger.info "[ProcessCommitJob] Processing commit #{commit_sha} for User ##{user_id} via #{provider_sym} from URL: #{commit_api_url}"
+
+    case provider_sym
+    when :github
+      process_github_commit(user, commit_sha, commit_api_url)
+    # Add other providers like :gitlab later
+    # when :gitlab
+    #   process_gitlab_commit(user, commit_sha, commit_api_url)
+    else
+      Rails.logger.error "[ProcessCommitJob] Unknown provider '#{provider_sym}' for commit #{commit_sha}."
+    end
+  end
+
+  private
+
+  # Fetches the commit from GitHub with the user's token and stores it.
+  # Re-raises HTTP errors and raises GitHubApiError on 5xx/401 so retry_on can reschedule the job.
+  def process_github_commit(user, commit_sha, commit_api_url)
+    unless user.github_access_token.present?
+      Rails.logger.warn "[ProcessCommitJob] User ##{user.id} missing GitHub token for commit #{commit_sha}. Skipping."
+      return
+    end
+
+    begin
+      response = HTTP.headers(
+        "Accept" => "application/vnd.github.v3+json",
+        "Authorization" => "Bearer #{user.github_access_token}",
+        "X-GitHub-Api-Version" => "2022-11-28"
+      ).timeout(connect: 5, read: 10).get(commit_api_url)
+
+      if response.status.success?
+        commit_data_json = response.parse
+
+        api_commit_sha = commit_data_json['sha']
+        unless api_commit_sha == commit_sha
+          Rails.logger.error "[ProcessCommitJob] SHA mismatch for User ##{user.id}. Expected #{commit_sha}, API returned #{api_commit_sha}. URL: #{commit_api_url}"
+          return # Critical data integrity issue
+        end
+
+        committer_date_str = commit_data_json.dig('commit', 'committer', 'date')
+        unless committer_date_str
+          Rails.logger.error "[ProcessCommitJob] Committer date not found in API response for commit #{commit_sha}. Data: #{commit_data_json.inspect}"
+          return
+        end
+
+        begin
+          # API dates are typically ISO8601 (UTC). Time.zone.parse respects the application's zone.
+          # It's good practice to store in UTC, which parse will do correctly for ISO8601.
+          commit_actual_created_at = Time.zone.parse(committer_date_str)
+        rescue ArgumentError
+          commit_actual_created_at = nil
+        end
+
+        # Time.zone.parse can also return nil instead of raising; a nil created_at would be
+        # filled in with the current time on create, so treat nil as an invalid date too.
+        unless commit_actual_created_at
+          Rails.logger.error "[ProcessCommitJob] Invalid committer date format '#{committer_date_str}' for commit #{commit_sha}."
+          return
+        end
+
+        Commit.create!(
+          sha: api_commit_sha,
+          user_id: user.id,
+          github_raw: commit_data_json,
+          created_at: commit_actual_created_at, # Manually set created_at
+          updated_at: Time.current # Let Rails handle updated_at, or set explicitly
+        )
+        Rails.logger.info "[ProcessCommitJob] Successfully stored commit #{api_commit_sha} for User ##{user.id}."
+
+      elsif response.status.code == 404
+        Rails.logger.warn "[ProcessCommitJob] Commit #{commit_sha} not found (404) at #{commit_api_url} for User ##{user.id}."
+      elsif response.status.code == 403 # Forbidden, could be rate limit or permissions
+        rate_limit_remaining = response.headers['X-RateLimit-Remaining']
+        # Require the header to be present: nil.to_i is 0, so a 403 without it would be re-enqueued every 5s.
+        if rate_limit_remaining.present? && rate_limit_remaining.to_i.zero?
+          reset_time = Time.at(response.headers['X-RateLimit-Reset'].to_i)
+          delay_seconds = [(reset_time - Time.current).ceil, 5].max # at least 5s delay
+          Rails.logger.warn "[ProcessCommitJob] GitHub API rate limit exceeded for User ##{user.id}. Retrying in #{delay_seconds}s. URL: #{commit_api_url}"
+          self.class.set(wait: delay_seconds.seconds).perform_later(user.id, commit_sha, commit_api_url, 'github')
+        else
+          Rails.logger.error "[ProcessCommitJob] GitHub API forbidden (403) for User ##{user.id}. URL: #{commit_api_url}. Response: #{response.body.to_s.truncate(500)}"
+        end
+      else
+        Rails.logger.error "[ProcessCommitJob] GitHub API error for User ##{user.id}. Status: #{response.status}. URL: #{commit_api_url}. Response: #{response.body.to_s.truncate(500)}"
+        # Trigger retry for server errors or auth issues
+        raise GitHubApiError, "GitHub API Error: Status #{response.status}" if response.status.server_error? || response.status.code == 401
+      end
+
+    rescue HTTP::Error => e # Covers TimeoutError, ConnectionError
+      Rails.logger.error "[ProcessCommitJob] HTTP Error fetching commit #{commit_sha} for User ##{user.id}: #{e.message}. URL: #{commit_api_url}"
+      raise # Re-raise to allow GoodJob to retry based on retry_on
+    rescue JSON::ParserError => e
+      Rails.logger.error "[ProcessCommitJob] JSON Parse Error for commit #{commit_sha} (User ##{user.id}): #{e.message}. URL: #{commit_api_url}. Body: #{response&.body&.to_s&.truncate(200)}"
+      # Malformed JSON usually isn't temporary, so might not retry unless API is known to be flaky.
+      # NOTE(review): because this is not re-raised, retry_on JSON::ParserError never fires.
+    rescue ActiveRecord::RecordNotUnique
+      # Another job stored the same SHA between the exists? check and create!; the commit is already saved.
+      Rails.logger.info "[ProcessCommitJob] Commit #{commit_sha} already stored by a concurrent job. Skipping."
+    rescue ActiveRecord::RecordInvalid => e
+      Rails.logger.error "[ProcessCommitJob] Validation failed for commit #{commit_sha} (User ##{user.id}): #{e.message}. Data to save: sha=#{api_commit_sha}, user_id=#{user.id}, created_at=#{commit_actual_created_at}"
+      # This indicates a local data or logic issue, usually not retried.
+    end
+  end
+end
diff --git a/app/jobs/scan_repo_events_for_commits_job.rb b/app/jobs/scan_repo_events_for_commits_job.rb
new file mode 100644
index 0000000..b87b344
--- /dev/null
+++ b/app/jobs/scan_repo_events_for_commits_job.rb
@@ -0,0 +1,77 @@
+class ScanRepoEventsForCommitsJob < ApplicationJob
+  queue_as :low_priority # This can be a less frequent, background task
+
+  include GoodJob::ActiveJobExtensions::Concurrency
+  good_job_control_concurrency_with(
+    total_limit: 1, # Only one instance of this job should run at a time
+    key: -> { self.class.name },
+    drop: true # If another instance is running or queued, drop this one
+  )
+
+  def perform
+    Rails.logger.info "[ScanRepoEventsForCommitsJob] Starting scan of RepoHostEvents for new commits."
+
+    # Determine the lookback window. Consider events from the last N days.
+    # If you have a way to track processed events (e.g., a new column on RepoHostEvent),
+    # you could use that. For now, we'll use a time window and rely on Commit.exists?
+    # to avoid re-processing.
+    time_window_start = 90.days.ago
+
+    # Process events in batches to manage memory
+    # Filter for GitHub PushEvents initially
+    # No .order here: find_each batches by primary key and ignores any scoped order
+    # (or raises when error_on_ignored_order is enabled).
+    RepoHostEvent
+      .where(provider: RepoHostEvent.providers[:github])
+      .where("raw_event_payload->>'type' = ?", 'PushEvent') # Efficiently query JSONB
+      .where("created_at >= ?", time_window_start) # Focus on recent events
+      .find_each(batch_size: 100) do |event|
+
+      process_event(event)
+    end
+
+    Rails.logger.info "[ScanRepoEventsForCommitsJob] Finished scan."
+  end
+
+  private
+
+  # Enqueues a ProcessCommitJob for every commit in the event's payload that is not stored yet.
+  def process_event(event)
+    user = event.user
+    unless user
+      Rails.logger.warn "[ScanRepoEventsForCommitsJob] Event ID #{event.id} has no associated user. Skipping."
+      return
+    end
+
+    payload = event.raw_event_payload
+    # Safely access nested commit data from the JSON payload
+    commits_data = payload.dig('payload', 'commits')
+
+    unless commits_data.is_a?(Array) && commits_data.any?
+      # Rails.logger.debug "[ScanRepoEventsForCommitsJob] Event ID #{event.id} (User ##{user.id}) is a PushEvent but has no commits. Skipping."
+      return
+    end
+
+    commits_data.each do |commit_info|
+      commit_sha = commit_info['sha']
+      # The 'url' in the PushEvent's commit object is the API URL for that commit
+      commit_api_url = commit_info['url']
+
+      if commit_sha.blank? || commit_api_url.blank?
+        Rails.logger.warn "[ScanRepoEventsForCommitsJob] Event ID #{event.id} (User ##{user.id}) has a commit with missing SHA or API URL. Info: #{commit_info.inspect}"
+        next
+      end
+
+      # Main check: Only enqueue if the commit SHA is not already in the Commit table.
+      # This is crucial for idempotency and efficiency.
+      unless Commit.exists?(sha: commit_sha)
+        Rails.logger.info "[ScanRepoEventsForCommitsJob] Enqueuing ProcessCommitJob for SHA #{commit_sha}, User ##{user.id}, Provider #{event.provider}."
+        ProcessCommitJob.perform_later(user.id, commit_sha, commit_api_url, event.provider.to_s)
+      end
+    end
+  rescue JSON::ParserError => e
+    Rails.logger.error "[ScanRepoEventsForCommitsJob] Failed to parse raw_event_payload for Event ID #{event.id}: #{e.message}"
+  rescue => e # Catch other potential errors during event processing
+    Rails.logger.error "[ScanRepoEventsForCommitsJob] Error processing Event ID #{event.id}: #{e.message}\n#{e.backtrace.take(5).join("\n")}"
+  end
+end
diff --git a/app/models/commit.rb b/app/models/commit.rb
new file mode 100644
index 0000000..c85c913
--- /dev/null
+++ b/app/models/commit.rb
@@ -0,0 +1,17 @@
+class Commit < ApplicationRecord
+  # Explicitly set 'sha' as the primary key for ActiveRecord.
+  # This is crucial because we defined it as such in the migration.
+  self.primary_key = :sha
+
+  belongs_to :user
+
+  validates :sha, presence: true, uniqueness: true
+  validates :user_id, presence: true
+  # `github_raw` could be validated for presence if a commit record implies it must have GitHub data.
+  # validates :github_raw, presence: true
+
+  # Note on timestamps:
+  # Rails will automatically manage `updated_at`.
+  # We will manually set `created_at` when creating a record,
+  # based on the `committer.date` from the API.
+end
diff --git a/config/initializers/good_job.rb b/config/initializers/good_job.rb
index e5cc32c..203e0ff 100644
--- a/config/initializers/good_job.rb
+++ b/config/initializers/good_job.rb
@@ -58,6 +58,11 @@ Rails.application.configure do
       class: "SyncAllUserRepoEventsJob",
       description: "Periodically syncs repository events for all eligible users."
     },
+    scan_repo_events_for_commits: {
+      cron: "0 */3 * * *", # Every 3 hours at minute 0
+      class: "ScanRepoEventsForCommitsJob",
+      description: "Scans repository host events (PushEvents) and enqueues jobs to process new commits."
+    },
     cleanup_expired_email_verification_requests: {
       cron: "* * * * *",
       class: "CleanupExpiredEmailVerificationRequestsJob"
diff --git a/db/migrate/20250514212714_create_commits.rb b/db/migrate/20250514212714_create_commits.rb
new file mode 100644
index 0000000..2088e44
--- /dev/null
+++ b/db/migrate/20250514212714_create_commits.rb
@@ -0,0 +1,10 @@
+class CreateCommits < ActiveRecord::Migration[8.0]
+  def change
+    create_table :commits, primary_key: :sha, id: :string do |t|
+      t.references :user, null: false, foreign_key: true
+      t.jsonb :github_raw
+
+      t.timestamps null: false
+    end
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
index 1f7b3b6..b2170da 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -10,7 +10,7 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema[8.0].define(version: 2025_05_14_180503) do
+ActiveRecord::Schema[8.0].define(version: 2025_05_14_212714) do
   create_schema "pganalyze"
 
   # These are extensions that must be enabled in order to support this database
@@ -72,6 +72,14 @@ ActiveRecord::Schema[8.0].define(version: 2025_05_14_180503) do
     t.index ["user_id"], name: "index_api_keys_on_user_id"
   end
 
+  create_table "commits", primary_key: "sha", id: :string, force: :cascade do |t|
+    t.bigint "user_id", null: false
+    t.jsonb "github_raw"
+    t.datetime "created_at", null: false
+    t.datetime "updated_at", null: false
+    t.index ["user_id"], name: "index_commits_on_user_id"
+  end
+
   create_table "email_addresses", force: :cascade do |t|
     t.string "email"
     t.bigint "user_id", null: false
@@ -389,6 +397,7 @@ ActiveRecord::Schema[8.0].define(version: 2025_05_14_180503) do
   end
 
   add_foreign_key "api_keys", "users"
+  add_foreign_key "commits", "users"
   add_foreign_key "email_addresses", "users"
   add_foreign_key "email_verification_requests", "users"
   add_foreign_key "heartbeats", "raw_heartbeat_uploads"