Sync in commit data

This commit is contained in:
Zach Latta
2025-05-14 17:35:41 -04:00
parent 1315086a04
commit a872ab3198
8 changed files with 253 additions and 1 deletions

View File

@@ -0,0 +1,14 @@
# Avo resource describing how Commit records are presented in the Avo UI.
class Avo::Resources::Commit < Avo::BaseResource
# Generator-provided options, currently left disabled:
# self.includes = []
# self.attachments = []
# self.search = {
# query: -> { query.ransack(id_eq: params[:q], m: "or").result(distinct: false) }
# }
# Declares the fields exposed for a commit: its id, its SHA as text, the
# owning user as a belongs_to association, and the raw GitHub payload
# rendered with the code field type.
def fields
field :id, as: :id
field :sha, as: :text
field :user, as: :belongs_to
field :github_raw, as: :code
end
end

View File

@@ -0,0 +1,4 @@
# This controller has been generated to enable Rails' resource routes.
# More information on https://docs.avohq.io/3.0/controllers.html
# It defines nothing of its own; all behavior is inherited from
# Avo::ResourcesController.
class Avo::CommitsController < Avo::ResourcesController
end

View File

@@ -0,0 +1,118 @@
require 'http'
require 'json'
# Fetches a single commit from a repository host's API and stores it as a
# Commit record.
#
# All arguments are primitives (user id, SHA, API URL, provider name as a
# String). Only the :github provider is implemented; any other provider is
# logged and ignored.
class ProcessCommitJob < ApplicationJob
  # Raised for GitHub responses that should be retried (5xx and 401).
  # Subclasses RuntimeError so that anything which handled the plain
  # `raise "..."` used previously keeps working.
  class GitHubApiError < RuntimeError; end

  queue_as :literally_whenever

  # Retry on common network issues or temporary API errors.
  # `:polynomially_longer` is the current name of the backoff algorithm that
  # used to be called `:exponentially_longer`.
  retry_on HTTP::TimeoutError, HTTP::ConnectionError, wait: :polynomially_longer, attempts: 5
  retry_on GitHubApiError, wait: :polynomially_longer, attempts: 5
  # NOTE: process_github_commit rescues JSON::ParserError itself and does not
  # re-raise, so this rule does not fire for errors raised while parsing the
  # GitHub response there.
  retry_on JSON::ParserError, wait: 10.seconds, attempts: 3 # If API returns malformed JSON
  discard_on ActiveJob::DeserializationError

  # @param user_id [Integer] id of the User the commit is attributed to
  # @param commit_sha [String] SHA the caller expects the API to return
  # @param commit_api_url [String] provider API URL for this commit
  # @param provider_string [String] provider name, e.g. "github"
  # @return [void]
  def perform(user_id, commit_sha, commit_api_url, provider_string)
    provider_sym = provider_string.to_sym # Convert string back to symbol

    user = User.find_by(id: user_id)
    unless user
      Rails.logger.warn "[ProcessCommitJob] User ##{user_id} not found. Skipping commit #{commit_sha}."
      return
    end

    # Idempotency: nothing to do when the commit has already been stored.
    return if Commit.exists?(sha: commit_sha)

    Rails.logger.info "[ProcessCommitJob] Processing commit #{commit_sha} for User ##{user_id} via #{provider_sym} from URL: #{commit_api_url}"

    case provider_sym
    when :github
      process_github_commit(user, commit_sha, commit_api_url)
    else
      Rails.logger.error "[ProcessCommitJob] Unknown provider '#{provider_sym}' for commit #{commit_sha}."
    end
  end

  private

  # Requests the commit from GitHub with the user's token and dispatches on
  # the response status.
  #
  # @raise [HTTP::Error] re-raised after logging so retry_on can handle it
  # @raise [GitHubApiError] for 5xx and 401 responses
  def process_github_commit(user, commit_sha, commit_api_url)
    unless user.github_access_token.present?
      Rails.logger.warn "[ProcessCommitJob] User ##{user.id} missing GitHub token for commit #{commit_sha}. Skipping."
      return
    end

    response = HTTP.headers(
      "Accept" => "application/vnd.github.v3+json",
      "Authorization" => "Bearer #{user.github_access_token}",
      "X-GitHub-Api-Version" => "2022-11-28"
    ).timeout(connect: 5, read: 10).get(commit_api_url)

    if response.status.success?
      store_github_commit(user, commit_sha, commit_api_url, response.parse)
    elsif response.status.code == 404
      Rails.logger.warn "[ProcessCommitJob] Commit #{commit_sha} not found (404) at #{commit_api_url} for User ##{user.id}."
    elsif response.status.code == 403 # Forbidden, could be rate limit or permissions
      handle_github_forbidden(user, commit_sha, commit_api_url, response)
    else
      Rails.logger.error "[ProcessCommitJob] GitHub API error for User ##{user.id}. Status: #{response.status}. URL: #{commit_api_url}. Response: #{response.body.to_s.truncate(500)}"
      # Server errors and auth failures are raised so the retry_on rule for
      # GitHubApiError applies; other statuses are only logged.
      if response.status.server_error? || response.status.code == 401
        raise GitHubApiError, "GitHub API Error: Status #{response.status}"
      end
    end
  rescue HTTP::Error => e # Covers TimeoutError, ConnectionError
    Rails.logger.error "[ProcessCommitJob] HTTP Error fetching commit #{commit_sha} for User ##{user.id}: #{e.message}. URL: #{commit_api_url}"
    raise # Re-raise so the retry_on rules above can apply
  rescue JSON::ParserError => e
    # Logged and swallowed: malformed JSON is treated as non-temporary here.
    Rails.logger.error "[ProcessCommitJob] JSON Parse Error for commit #{commit_sha} (User ##{user.id}): #{e.message}. URL: #{commit_api_url}. Body: #{response&.body&.to_s&.truncate(200)}"
  end

  # Validates the parsed API payload and persists it as a Commit whose
  # created_at is the committer date reported by the API.
  def store_github_commit(user, commit_sha, commit_api_url, commit_data_json)
    api_commit_sha = commit_data_json['sha']
    unless api_commit_sha == commit_sha
      Rails.logger.error "[ProcessCommitJob] SHA mismatch for User ##{user.id}. Expected #{commit_sha}, API returned #{api_commit_sha}. URL: #{commit_api_url}"
      return # Critical data integrity issue
    end

    committer_date_str = commit_data_json.dig('commit', 'committer', 'date')
    unless committer_date_str
      Rails.logger.error "[ProcessCommitJob] Committer date not found in API response for commit #{commit_sha}. Data: #{commit_data_json.inspect}"
      return
    end

    commit_actual_created_at = parse_committer_date(committer_date_str)
    unless commit_actual_created_at
      Rails.logger.error "[ProcessCommitJob] Invalid committer date format '#{committer_date_str}' for commit #{commit_sha}."
      return
    end

    Commit.create!(
      sha: api_commit_sha,
      user_id: user.id,
      github_raw: commit_data_json,
      created_at: commit_actual_created_at, # Manually set created_at
      updated_at: Time.current
    )
    Rails.logger.info "[ProcessCommitJob] Successfully stored commit #{api_commit_sha} for User ##{user.id}."
  rescue ActiveRecord::RecordNotUnique
    # Another job inserted the same SHA between the exists? check and the
    # insert; the commit is stored, so there is nothing left to do.
    Rails.logger.info "[ProcessCommitJob] Commit #{commit_sha} was stored concurrently. Skipping."
  rescue ActiveRecord::RecordInvalid => e
    # This indicates a local data or logic issue, so it is logged, not retried.
    Rails.logger.error "[ProcessCommitJob] Validation failed for commit #{commit_sha} (User ##{user.id}): #{e.message}. Data to save: sha=#{api_commit_sha}, user_id=#{user.id}, created_at=#{commit_actual_created_at}"
  end

  # Parses the committer date, returning nil when it cannot be parsed.
  # Time.zone.parse may either raise ArgumentError or return nil for bad
  # input; both are mapped to nil so a bad date never reaches create!.
  def parse_committer_date(committer_date_str)
    Time.zone.parse(committer_date_str)
  rescue ArgumentError
    nil
  end

  # Handles a 403 response: re-enqueues the job after the rate-limit reset
  # when GitHub explicitly reports zero remaining requests, otherwise logs.
  def handle_github_forbidden(user, commit_sha, commit_api_url, response)
    # Compare against the literal "0": a missing header must not be treated
    # as an exhausted rate limit, or a permission 403 would be re-enqueued
    # indefinitely.
    if response.headers['X-RateLimit-Remaining'].to_s == '0'
      reset_time = Time.at(response.headers['X-RateLimit-Reset'].to_i)
      delay_seconds = [(reset_time - Time.current).ceil, 5].max # at least 5s delay
      Rails.logger.warn "[ProcessCommitJob] GitHub API rate limit exceeded for User ##{user.id}. Retrying in #{delay_seconds}s. URL: #{commit_api_url}"
      self.class.set(wait: delay_seconds.seconds).perform_later(user.id, commit_sha, commit_api_url, 'github')
    else
      Rails.logger.error "[ProcessCommitJob] GitHub API forbidden (403) for User ##{user.id}. URL: #{commit_api_url}. Response: #{response.body.to_s.truncate(500)}"
    end
  end
end

View File

@@ -0,0 +1,75 @@
# Scans recent GitHub PushEvents stored as RepoHostEvent records and enqueues
# a ProcessCommitJob for every referenced commit SHA that is not yet in the
# commits table.
class ScanRepoEventsForCommitsJob < ApplicationJob
  # How far back (by event created_at) a scan looks for PushEvents.
  LOOKBACK_WINDOW = 90.days
  # Number of events loaded per batch to bound memory use.
  BATCH_SIZE = 100

  queue_as :low_priority # This can be a less frequent, background task

  include GoodJob::ActiveJobExtensions::Concurrency

  # NOTE(review): confirm that `drop:` is an option recognized by the
  # installed GoodJob version.
  good_job_control_concurrency_with(
    total_limit: 1, # Only one instance of this job should run at a time
    key: -> { self.class.name },
    drop: true # If another instance is running or queued, drop this one
  )

  # Walks the matching events in batches and hands each one to process_event.
  # There is no per-event "processed" marker; re-processing is avoided by the
  # Commit.exists? check made for every SHA.
  #
  # @return [void]
  def perform
    Rails.logger.info "[ScanRepoEventsForCommitsJob] Starting scan of RepoHostEvents for new commits."

    time_window_start = LOOKBACK_WINDOW.ago

    # find_each always batches by primary key and ignores a scoped `order`,
    # so the direction is passed through its own `order:` option instead.
    # Descending primary key is presumably newest-first - confirm that ids
    # grow with insertion time.
    RepoHostEvent
      .where(provider: RepoHostEvent.providers[:github])
      .where("raw_event_payload->>'type' = ?", 'PushEvent') # Filter inside the JSON payload
      .where("created_at >= ?", time_window_start) # Focus on recent events
      .find_each(batch_size: BATCH_SIZE, order: :desc) do |event|
        process_event(event)
      end

    Rails.logger.info "[ScanRepoEventsForCommitsJob] Finished scan."
  end

  private

  # Enqueues ProcessCommitJob for each commit listed in the event payload
  # whose SHA is not stored yet. Errors are logged and swallowed so that one
  # bad event does not abort the whole scan.
  def process_event(event)
    user = event.user
    unless user
      Rails.logger.warn "[ScanRepoEventsForCommitsJob] Event ID #{event.id} has no associated user. Skipping."
      return
    end

    # Safely access nested commit data from the JSON payload
    commits_data = event.raw_event_payload.dig('payload', 'commits')
    return unless commits_data.is_a?(Array) && commits_data.any?

    commits_data.each do |commit_info|
      commit_sha = commit_info['sha']
      # The 'url' in the PushEvent's commit object is the API URL for that commit
      commit_api_url = commit_info['url']

      if commit_sha.blank? || commit_api_url.blank?
        Rails.logger.warn "[ScanRepoEventsForCommitsJob] Event ID #{event.id} (User ##{user.id}) has a commit with missing SHA or API URL. Info: #{commit_info.inspect}"
        next
      end

      # Only enqueue when the SHA is not already in the commits table.
      next if Commit.exists?(sha: commit_sha)

      Rails.logger.info "[ScanRepoEventsForCommitsJob] Enqueuing ProcessCommitJob for SHA #{commit_sha}, User ##{user.id}, Provider #{event.provider}."
      ProcessCommitJob.perform_later(user.id, commit_sha, commit_api_url, event.provider.to_s)
    end
  rescue JSON::ParserError => e
    Rails.logger.error "[ScanRepoEventsForCommitsJob] Failed to parse raw_event_payload for Event ID #{event.id}: #{e.message}"
  rescue => e # Catch other potential errors during event processing
    # Array() guards against exceptions that carry no backtrace.
    Rails.logger.error "[ScanRepoEventsForCommitsJob] Error processing Event ID #{event.id}: #{e.message}\n#{Array(e.backtrace).take(5).join("\n")}"
  end
end

17
app/models/commit.rb Normal file
View File

@@ -0,0 +1,17 @@
# A source-control commit attributed to a user, keyed by its SHA.
class Commit < ApplicationRecord
  # The commits table uses the string `sha` column as its primary key, so
  # ActiveRecord has to be told explicitly.
  self.primary_key = :sha

  belongs_to :user

  validates :sha, presence: true
  validates :sha, uniqueness: true
  validates :user_id, presence: true

  # `github_raw` is not validated; add a presence validation if every commit
  # record must carry GitHub data.
  # validates :github_raw, presence: true

  # Timestamps: `created_at` is supplied by the caller at creation time from
  # the API's `committer.date`, while `updated_at` is left to Rails.
end

View File

@@ -58,6 +58,11 @@ Rails.application.configure do
class: "SyncAllUserRepoEventsJob",
description: "Periodically syncs repository events for all eligible users."
},
scan_repo_events_for_commits: {
cron: "0 */3 * * *", # Every 3 hours at minute 0
class: "ScanRepoEventsForCommitsJob",
description: "Scans repository host events (PushEvents) and enqueues jobs to process new commits."
},
cleanup_expired_email_verification_requests: {
cron: "* * * * *",
class: "CleanupExpiredEmailVerificationRequestsJob"

View File

@@ -0,0 +1,10 @@
# Creates the commits table, keyed by the commit SHA rather than an integer id.
class CreateCommits < ActiveRecord::Migration[8.0]
def change
# `primary_key: :sha, id: :string` makes the string `sha` column the primary key.
create_table :commits, primary_key: :sha, id: :string do |t|
# Every commit must reference a user: NOT NULL plus a foreign key constraint.
t.references :user, null: false, foreign_key: true
# Raw commit payload from GitHub; nullable.
t.jsonb :github_raw
t.timestamps null: false
end
end
end

11
db/schema.rb generated
View File

@@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema[8.0].define(version: 2025_05_14_180503) do
ActiveRecord::Schema[8.0].define(version: 2025_05_14_212714) do
create_schema "pganalyze"
# These are extensions that must be enabled in order to support this database
@@ -72,6 +72,14 @@ ActiveRecord::Schema[8.0].define(version: 2025_05_14_180503) do
t.index ["user_id"], name: "index_api_keys_on_user_id"
end
create_table "commits", primary_key: "sha", id: :string, force: :cascade do |t|
t.bigint "user_id", null: false
t.jsonb "github_raw"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["user_id"], name: "index_commits_on_user_id"
end
create_table "email_addresses", force: :cascade do |t|
t.string "email"
t.bigint "user_id", null: false
@@ -389,6 +397,7 @@ ActiveRecord::Schema[8.0].define(version: 2025_05_14_180503) do
end
add_foreign_key "api_keys", "users"
add_foreign_key "commits", "users"
add_foreign_key "email_addresses", "users"
add_foreign_key "email_verification_requests", "users"
add_foreign_key "heartbeats", "raw_heartbeat_uploads"