From e93d83e4e7970535a3431fcc046e7ea93ef44f03 Mon Sep 17 00:00:00 2001 From: Niko Belokolodov Date: Wed, 15 Oct 2025 12:20:00 +1300 Subject: [PATCH 1/8] Add validation for Snowplow contexts In development env Snowplow contexts will be validated against their schemas --- lib/gitlab/tracking/destinations/snowplow.rb | 6 + .../snowplow_context_validator.rb | 108 +++++++++++++ .../snowplow_context_validator_spec.rb | 152 ++++++++++++++++++ .../tracking/destinations/snowplow_spec.rb | 36 +++++ 4 files changed, 302 insertions(+) create mode 100644 lib/gitlab/tracking/destinations/snowplow_context_validator.rb create mode 100644 spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb diff --git a/lib/gitlab/tracking/destinations/snowplow.rb b/lib/gitlab/tracking/destinations/snowplow.rb index 6795e8c14df84d..631610500e4357 100644 --- a/lib/gitlab/tracking/destinations/snowplow.rb +++ b/lib/gitlab/tracking/destinations/snowplow.rb @@ -23,9 +23,15 @@ def initialize(destination_configuration = DestinationConfiguration.snowplow_con Kernel.at_exit { tracker.flush(async: false) } end + def verify_context(context) + @context_validator.validate(context) + end + def event(category, action, label: nil, property: nil, value: nil, context: nil) return unless @event_eligibility_checker.eligible?(action) + @context_validator = SnowplowContextValidator.new.validate(context) if Rails.env.development? + tracker.track_struct_event( category: category, action: action, diff --git a/lib/gitlab/tracking/destinations/snowplow_context_validator.rb b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb new file mode 100644 index 00000000000000..fba951cb8a271b --- /dev/null +++ b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb @@ -0,0 +1,108 @@ +# frozen_string_literal: true + +module Gitlab + module Tracking + module Destinations + class SnowplowContextValidator + IGLU_SCHEMA_URL_REGEX = %r{\Aiglu:[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+/jsonschema/\d+-\d+-\d+\z} + + def validate(context) + return unless context + + if context.is_a?(Array) + context.each { |item| validate(item) } + return + end + + unless context.is_a?(SnowplowTracker::SelfDescribingJson) + log_validation_error("Context must be a SnowplowTracker::SelfDescribingJson object", context) + return + end + + json = context.to_json + validate_against_schema(json[:schema], json[:data]) if schema_validation_enabled? + end + + private + + def schema_validation_enabled? + Rails.env.test? || Rails.env.development? + end + + def valid_iglu_schema_url?(schema_url) + return false unless schema_url.is_a?(String) + + # Example: iglu:com.gitlab/gitlab_standard/jsonschema/1-1-7 + schema_url.match?(IGLU_SCHEMA_URL_REGEX) + end + + def log_validation_error(message, context) + exception = ArgumentError.new("Snowplow context validation error: #{message}") + + Gitlab::ErrorTracking.track_and_raise_for_dev_exception( + exception, + context_class: context.class.name, + context_inspect: context.inspect + ) + end + + def validate_against_schema(schema_url, data) + schema_definition = fetch_schema_from_iglu(schema_url) + return unless schema_definition + + validator = JSONSchemer.schema(schema_definition) + errors = validator.validate(data).to_a + + log_schema_validation_errors(schema_url, data, errors) if errors.any? + end + + def fetch_schema_from_iglu(schema_url) + cache_key = "snowplow:schema:#{schema_url}" + + Rails.cache.fetch(cache_key, expires_in: 1.hour, skip_nil: true) do + fetch_schema_from_iglu_without_cache(schema_url) + end + end + + def fetch_schema_from_iglu_without_cache(schema_url) + url = "https://gitlab-org.gitlab.io/iglu/schemas/#{schema_url.delete_prefix('iglu:')}" + + response = Gitlab::HTTP.get(url, allow_local_requests: true) + + if response.success? + Gitlab::Json.parse(response.body) + else + exception = StandardError.new("Failed to fetch Snowplow schema from Iglu registry") + + Gitlab::ErrorTracking.track_and_raise_for_dev_exception( + exception, + schema_url: schema_url, + http_url: url, + status_code: response.code + ) + nil + end + rescue StandardError => e + Gitlab::ErrorTracking.track_and_raise_for_dev_exception( + e, + schema_url: schema_url, + http_url: url + ) + nil + end + + def log_schema_validation_errors(schema_url, data, errors) + error_messages = errors.map { |error| JSONSchemer::Errors.pretty(error) } + exception = ArgumentError.new("Snowplow context data does not match schema: #{error_messages.join(', ')}") + + Gitlab::ErrorTracking.track_and_raise_for_dev_exception( + exception, + schema_url: schema_url, + data: data, + validation_errors: error_messages + ) + end + end + end + end +end diff --git a/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb b/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb new file mode 100644 index 00000000000000..f270c3add053f7 --- /dev/null +++ b/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb @@ -0,0 +1,152 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Gitlab::Tracking::Destinations::SnowplowContextValidator, feature_category: :application_instrumentation do + subject(:validator) { described_class.new } + + let(:valid_schema_url) { 'iglu:com.gitlab/gitlab_standard/jsonschema/1-1-7' } + let(:valid_data) { { 'environment' => 'test', 'source' => 'gitlab-rails' } } + let(:valid_context) { SnowplowTracker::SelfDescribingJson.new(valid_schema_url, valid_data) } + + describe '#validate' do + let(:schema_definition) do + { + 'type' => 'object', + 'properties' => { + 'environment' => { 'type' => 'string' }, + 'source' => { 'type' => 'string' } + } + } + end + + before do + stub_request(:get, "https://gitlab-org.gitlab.io/iglu/schemas/#{valid_schema_url.delete_prefix('iglu:')}") + .to_return(status: 200, body: schema_definition.to_json) + end + + context 'with a valid SelfDescribingJson context' do + it 'does not raise an error' do + expect { validator.validate(valid_context) }.not_to raise_error + end + end + + context 'with an array of contexts' do + let(:contexts) { [valid_context, valid_context] } + + it 'validates each context in the array' do + expect { validator.validate(contexts) }.not_to raise_error + end + end + + context 'with nil context' do + it 'returns early without error' do + expect { validator.validate(nil) }.not_to raise_error + end + end + + context 'with invalid context type' do + it 'tracks the error in development' do + expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception) + .with(an_instance_of(ArgumentError), hash_including(:context_class)) + + validator.validate("invalid") + end + end + end + + describe 'schema validation', :freeze_time do + let(:schema_definition) do + { + 'type' => 'object', + 'properties' => { + 'environment' => { 'type' => 'string' }, + 'source' => { 'type' => 'string' } + }, + 'required' => %w[environment source] + } + end + + before do + allow(Rails.env).to receive(:development?).and_return(true) + stub_request(:get, "https://gitlab-org.gitlab.io/iglu/schemas/#{valid_schema_url.delete_prefix('iglu:')}") + .to_return(status: 200, body: schema_definition.to_json) + end + + context 'when data matches the schema' do + it 'does not raise an error' do + expect { validator.validate(valid_context) }.not_to raise_error + end + end + + context 'when data does not match the schema' do + let(:invalid_data) { { 'invalid_field' => 'value' } } + let(:invalid_context) { SnowplowTracker::SelfDescribingJson.new(valid_schema_url, invalid_data) } + + it 'tracks the validation error' do + expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception) + .with(an_instance_of(ArgumentError), hash_including(:schema_url, :validation_errors)) + + validator.validate(invalid_context) + end + end + end + + describe 'schema fetching from Iglu' do + let(:iglu_url) { "https://gitlab-org.gitlab.io/iglu/schemas/com.gitlab/gitlab_standard/jsonschema/1-1-7" } + let(:schema_definition) { { 'type' => 'object' } } + + before do + allow(Rails.env).to receive(:development?).and_return(true) + end + + context 'when schema fetch succeeds' do + before do + stub_request(:get, iglu_url) + .to_return(status: 200, body: schema_definition.to_json) + end + + it 'fetches and validates against the schema' do + expect { validator.validate(valid_context) }.not_to raise_error + end + + it 'caches the schema for subsequent requests', :use_clean_rails_memory_store_caching do + # First request - should hit the HTTP endpoint + validator.validate(valid_context) + expect(WebMock).to have_requested(:get, iglu_url).once + + # Second request - should use cached schema + validator.validate(valid_context) + expect(WebMock).to have_requested(:get, iglu_url).once # Still only once + end + end + + context 'when schema fetch fails' do + before do + stub_request(:get, iglu_url) + .to_return(status: 404, body: 'Not Found') + end + + it 'tracks the error' do + expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception) + .with(an_instance_of(StandardError), hash_including(:schema_url, :http_url, :status_code)) + + validator.validate(valid_context) + end + end + + context 'when network error occurs' do + before do + stub_request(:get, iglu_url) + .to_raise(SocketError.new('Failed to connect')) + end + + it 'tracks the error' do + expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception) + .with(an_instance_of(SocketError), hash_including(:schema_url)) + + validator.validate(valid_context) + end + end + end +end diff --git a/spec/lib/gitlab/tracking/destinations/snowplow_spec.rb b/spec/lib/gitlab/tracking/destinations/snowplow_spec.rb index 6f30607ad29658..3db46a3935c302 100644 --- a/spec/lib/gitlab/tracking/destinations/snowplow_spec.rb +++ b/spec/lib/gitlab/tracking/destinations/snowplow_spec.rb @@ -35,6 +35,42 @@ freeze_time { example.run } end + describe 'context validation' do + let(:context) { SnowplowTracker::SelfDescribingJson.new('iglu:com.gitlab/test/jsonschema/1-0-0', { key: 'value' }) } + let(:validator) { instance_double(Gitlab::Tracking::Destinations::SnowplowContextValidator) } + + before do + allow(Gitlab::Tracking::Destinations::SnowplowContextValidator).to receive(:new).and_return(validator) + allow(validator).to receive(:validate) + allow(subject).to receive(:tracker).and_return(tracker) + allow(tracker).to receive(:track_struct_event) + end + + context 'when in development environment' do + before do + allow(Rails.env).to receive(:development?).and_return(true) + end + + it 'validates the context' do + subject.event('category', 'action', context: context) + + expect(validator).to have_received(:validate).with(context) + end + end + + context 'when not in development environment' do + before do + allow(Rails.env).to receive(:development?).and_return(false) + end + + it 'does not validate the context' do + subject.event('category', 'action', context: context) + + expect(validator).not_to have_received(:validate) + end + end + end + context 'when in production environment' do before do allow(Rails.env).to receive_messages( -- GitLab From ea2cbcf43b2912168671a50014baedee70bc342c Mon Sep 17 00:00:00 2001 From: Niko Belokolodov Date: Wed, 15 Oct 2025 12:50:10 +1300 Subject: [PATCH 2/8] Apply 2 suggestion(s) to 2 file(s) Co-authored-by: GitLab Duo --- lib/gitlab/tracking/destinations/snowplow.rb | 6 +----- .../tracking/destinations/snowplow_context_validator.rb | 2 ++ 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/lib/gitlab/tracking/destinations/snowplow.rb b/lib/gitlab/tracking/destinations/snowplow.rb index 631610500e4357..a2d5a1b3f5a75f 100644 --- a/lib/gitlab/tracking/destinations/snowplow.rb +++ b/lib/gitlab/tracking/destinations/snowplow.rb @@ -23,14 +23,10 @@ def initialize(destination_configuration = DestinationConfiguration.snowplow_con Kernel.at_exit { tracker.flush(async: false) } end - def verify_context(context) - @context_validator.validate(context) - end - def event(category, action, label: nil, property: nil, value: nil, context: nil) return unless @event_eligibility_checker.eligible?(action) - @context_validator = SnowplowContextValidator.new.validate(context) if Rails.env.development? + SnowplowContextValidator.new.validate(context) if Rails.env.development? tracker.track_struct_event( category: category, diff --git a/lib/gitlab/tracking/destinations/snowplow_context_validator.rb b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb index fba951cb8a271b..a80a693be86412 100644 --- a/lib/gitlab/tracking/destinations/snowplow_context_validator.rb +++ b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb @@ -47,6 +47,8 @@ def log_validation_error(message, context) end def validate_against_schema(schema_url, data) + return unless valid_iglu_schema_url?(schema_url) + schema_definition = fetch_schema_from_iglu(schema_url) return unless schema_definition -- GitLab From 0c5578a758a30e7f41c3523ab4e61e1e4a7504bd Mon Sep 17 00:00:00 2001 From: Niko Belokolodov Date: Wed, 15 Oct 2025 17:07:03 +1300 Subject: [PATCH 3/8] Apply 1 suggestion(s) to 1 file(s) Co-authored-by: GitLab Duo --- lib/gitlab/tracking/destinations/snowplow_context_validator.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/gitlab/tracking/destinations/snowplow_context_validator.rb b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb index a80a693be86412..1b97cefa154d9f 100644 --- a/lib/gitlab/tracking/destinations/snowplow_context_validator.rb +++ b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb @@ -69,7 +69,7 @@ def fetch_schema_from_iglu(schema_url) def fetch_schema_from_iglu_without_cache(schema_url) url = "https://gitlab-org.gitlab.io/iglu/schemas/#{schema_url.delete_prefix('iglu:')}" - response = Gitlab::HTTP.get(url, allow_local_requests: true) + response = Gitlab::HTTP.get(url, allow_local_requests: true, timeout: 5) if response.success? Gitlab::Json.parse(response.body) -- GitLab From ca84ca842d373cb09df464a6bfbe8e315c63f690 Mon Sep 17 00:00:00 2001 From: Niko Belokolodov Date: Thu, 16 Oct 2025 16:14:20 +1300 Subject: [PATCH 4/8] Fix validator implementation --- .../event_forward/event_forward_controller.rb | 5 +++ lib/gitlab/tracking/destinations/snowplow.rb | 2 -- .../snowplow_context_validator.rb | 34 ++----------------- 3 files changed, 8 insertions(+), 33 deletions(-) diff --git a/app/controllers/event_forward/event_forward_controller.rb b/app/controllers/event_forward/event_forward_controller.rb index 08c07a3a0352d0..45cdcacc6ba9ac 100644 --- a/app/controllers/event_forward/event_forward_controller.rb +++ b/app/controllers/event_forward/event_forward_controller.rb @@ -24,6 +24,11 @@ def process_events end events_to_forward.each do |event| + if Rails.env.development? && event['cx'] + context = Gitlab::Json.parse(Base64.decode64(event['cx'])) + Gitlab::Tracking::Destinations::SnowplowContextValidator.new.validate(context['data']) + end + update_app_id(event) tracker.emit_event_payload(event) end diff --git a/lib/gitlab/tracking/destinations/snowplow.rb b/lib/gitlab/tracking/destinations/snowplow.rb index a2d5a1b3f5a75f..6795e8c14df84d 100644 --- a/lib/gitlab/tracking/destinations/snowplow.rb +++ b/lib/gitlab/tracking/destinations/snowplow.rb @@ -26,8 +26,6 @@ def initialize(destination_configuration = DestinationConfiguration.snowplow_con def event(category, action, label: nil, property: nil, value: nil, context: nil) return unless @event_eligibility_checker.eligible?(action) - SnowplowContextValidator.new.validate(context) if Rails.env.development? - tracker.track_struct_event( category: category, action: action, diff --git a/lib/gitlab/tracking/destinations/snowplow_context_validator.rb b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb index 1b97cefa154d9f..1539ff117f406a 100644 --- a/lib/gitlab/tracking/destinations/snowplow_context_validator.rb +++ b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb @@ -4,8 +4,6 @@ module Gitlab module Tracking module Destinations class SnowplowContextValidator - IGLU_SCHEMA_URL_REGEX = %r{\Aiglu:[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+/jsonschema/\d+-\d+-\d+\z} - def validate(context) return unless context @@ -14,40 +12,14 @@ def validate(context) return end - unless context.is_a?(SnowplowTracker::SelfDescribingJson) - log_validation_error("Context must be a SnowplowTracker::SelfDescribingJson object", context) - return - end - - json = context.to_json - validate_against_schema(json[:schema], json[:data]) if schema_validation_enabled? + json = context.with_indifferent_access + validate_against_schema(json[:schema], json[:data]) end private - def schema_validation_enabled? - Rails.env.test? || Rails.env.development? - end - - def valid_iglu_schema_url?(schema_url) - return false unless schema_url.is_a?(String) - - # Example: iglu:com.gitlab/gitlab_standard/jsonschema/1-1-7 - schema_url.match?(IGLU_SCHEMA_URL_REGEX) - end - - def log_validation_error(message, context) - exception = ArgumentError.new("Snowplow context validation error: #{message}") - - Gitlab::ErrorTracking.track_and_raise_for_dev_exception( - exception, - context_class: context.class.name, - context_inspect: context.inspect - ) - end - def validate_against_schema(schema_url, data) - return unless valid_iglu_schema_url?(schema_url) + return unless schema_url.start_with?('iglu:com.gitlab') # No need to verify payloads from standard plugins schema_definition = fetch_schema_from_iglu(schema_url) return unless schema_definition -- GitLab From c27c30eebcdab00ab487d83f498907037d2e3ef9 Mon Sep 17 00:00:00 2001 From: Niko Belokolodov Date: Thu, 16 Oct 2025 16:24:00 +1300 Subject: [PATCH 5/8] Update the specs --- .../snowplow_context_validator_spec.rb | 20 ++------ .../tracking/destinations/snowplow_spec.rb | 36 ------------- .../event_forward_controller_spec.rb | 50 +++++++++++++++++++ 3 files changed, 53 insertions(+), 53 deletions(-) diff --git a/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb b/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb index f270c3add053f7..47d61cd66fcf74 100644 --- a/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb +++ b/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb @@ -7,7 +7,7 @@ let(:valid_schema_url) { 'iglu:com.gitlab/gitlab_standard/jsonschema/1-1-7' } let(:valid_data) { { 'environment' => 'test', 'source' => 'gitlab-rails' } } - let(:valid_context) { SnowplowTracker::SelfDescribingJson.new(valid_schema_url, valid_data) } + let(:valid_context) { { schema: valid_schema_url, data: valid_data } } describe '#validate' do let(:schema_definition) do @@ -25,7 +25,7 @@ .to_return(status: 200, body: schema_definition.to_json) end - context 'with a valid SelfDescribingJson context' do + context 'with a valid hash context' do it 'does not raise an error' do expect { validator.validate(valid_context) }.not_to raise_error end @@ -44,15 +44,6 @@ expect { validator.validate(nil) }.not_to raise_error end end - - context 'with invalid context type' do - it 'tracks the error in development' do - expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception) - .with(an_instance_of(ArgumentError), hash_including(:context_class)) - - validator.validate("invalid") - end - end end describe 'schema validation', :freeze_time do @@ -68,7 +59,6 @@ end before do - allow(Rails.env).to receive(:development?).and_return(true) stub_request(:get, "https://gitlab-org.gitlab.io/iglu/schemas/#{valid_schema_url.delete_prefix('iglu:')}") .to_return(status: 200, body: schema_definition.to_json) end @@ -81,7 +71,7 @@ context 'when data does not match the schema' do let(:invalid_data) { { 'invalid_field' => 'value' } } - let(:invalid_context) { SnowplowTracker::SelfDescribingJson.new(valid_schema_url, invalid_data) } + let(:invalid_context) { { schema: valid_schema_url, data: invalid_data } } it 'tracks the validation error' do expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception) @@ -96,10 +86,6 @@ let(:iglu_url) { "https://gitlab-org.gitlab.io/iglu/schemas/com.gitlab/gitlab_standard/jsonschema/1-1-7" } let(:schema_definition) { { 'type' => 'object' } } - before do - allow(Rails.env).to receive(:development?).and_return(true) - end - context 'when schema fetch succeeds' do before do stub_request(:get, iglu_url) diff --git a/spec/lib/gitlab/tracking/destinations/snowplow_spec.rb b/spec/lib/gitlab/tracking/destinations/snowplow_spec.rb index 3db46a3935c302..6f30607ad29658 100644 --- a/spec/lib/gitlab/tracking/destinations/snowplow_spec.rb +++ b/spec/lib/gitlab/tracking/destinations/snowplow_spec.rb @@ -35,42 +35,6 @@ freeze_time { example.run } end - describe 'context validation' do - let(:context) { SnowplowTracker::SelfDescribingJson.new('iglu:com.gitlab/test/jsonschema/1-0-0', { key: 'value' }) } - let(:validator) { instance_double(Gitlab::Tracking::Destinations::SnowplowContextValidator) } - - before do - allow(Gitlab::Tracking::Destinations::SnowplowContextValidator).to receive(:new).and_return(validator) - allow(validator).to receive(:validate) - allow(subject).to receive(:tracker).and_return(tracker) - allow(tracker).to receive(:track_struct_event) - end - - context 'when in development environment' do - before do - allow(Rails.env).to receive(:development?).and_return(true) - end - - it 'validates the context' do - subject.event('category', 'action', context: context) - - expect(validator).to have_received(:validate).with(context) - end - end - - context 'when not in development environment' do - before do - allow(Rails.env).to receive(:development?).and_return(false) - end - - it 'does not validate the context' do - subject.event('category', 'action', context: context) - - expect(validator).not_to have_received(:validate) - end - end - end - context 'when in production environment' do before do allow(Rails.env).to receive_messages( diff --git a/spec/requests/event_forward/event_forward_controller_spec.rb b/spec/requests/event_forward/event_forward_controller_spec.rb index 186879d46f5918..ddb84bf8715a5f 100644 --- a/spec/requests/event_forward/event_forward_controller_spec.rb +++ b/spec/requests/event_forward/event_forward_controller_spec.rb @@ -142,5 +142,55 @@ request end end + + describe 'context validation' do + let(:validator) { instance_double(Gitlab::Tracking::Destinations::SnowplowContextValidator) } + let(:context_data) { [{ 'schema' => 'iglu:com.gitlab/test/jsonschema/1-0-0', 'data' => { 'key' => 'value' } }] } + let(:encoded_context) { Base64.encode64({ 'data' => context_data }.to_json) } + let(:event_with_context) { { 'se_ac' => 'event_1', 'aid' => 'app_id_1', 'cx' => encoded_context } } + + before do + allow(Gitlab::Tracking::Destinations::SnowplowContextValidator).to receive(:new).and_return(validator) + allow(validator).to receive(:validate) + payload['data'] = [event_with_context] + end + + context 'when in development environment' do + before do + allow(Rails.env).to receive(:development?).and_return(true) + end + + it 'validates the context' do + request + + expect(validator).to have_received(:validate).with(context_data) + end + end + + context 'when not in development environment' do + before do + allow(Rails.env).to receive(:development?).and_return(false) + end + + it 'does not validate the context' do + request + + expect(validator).not_to have_received(:validate) + end + end + + context 'when event does not have cx field' do + before do + allow(Rails.env).to receive(:development?).and_return(true) + payload['data'] = [event_1] + end + + it 'does not validate the context' do + request + + expect(validator).not_to have_received(:validate) + end + end + end end end -- GitLab From 2dcce91e0c8260a7cf8d55208561b76bf5e0942f Mon Sep 17 00:00:00 2001 From: Niko Belokolodov Date: Tue, 21 Oct 2025 12:51:17 +1300 Subject: [PATCH 6/8] Address review comments --- .../event_forward/event_forward_controller.rb | 2 +- .../snowplow_context_validator.rb | 52 ++++++------------- .../snowplow_context_validator_spec.rb | 45 +++++++--------- .../event_forward_controller_spec.rb | 8 +-- 4 files changed, 41 insertions(+), 66 deletions(-) diff --git a/app/controllers/event_forward/event_forward_controller.rb b/app/controllers/event_forward/event_forward_controller.rb index 45cdcacc6ba9ac..ee86c1a76af70b 100644 --- a/app/controllers/event_forward/event_forward_controller.rb +++ b/app/controllers/event_forward/event_forward_controller.rb @@ -26,7 +26,7 @@ def process_events events_to_forward.each do |event| if Rails.env.development? && event['cx'] context = Gitlab::Json.parse(Base64.decode64(event['cx'])) - Gitlab::Tracking::Destinations::SnowplowContextValidator.new.validate(context['data']) + Gitlab::Tracking::Destinations::SnowplowContextValidator.new.validate!(context['data']) end update_app_id(event) diff --git a/lib/gitlab/tracking/destinations/snowplow_context_validator.rb b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb index 1539ff117f406a..bd4ceee1fa0cad 100644 --- a/lib/gitlab/tracking/destinations/snowplow_context_validator.rb +++ b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb @@ -4,16 +4,13 @@ module Gitlab module Tracking module Destinations class SnowplowContextValidator - def validate(context) + def validate!(context) return unless context - if context.is_a?(Array) - context.each { |item| validate(item) } - return + Array.wrap(context).each do |item| + json = item.with_indifferent_access + validate_against_schema(json[:schema], json[:data]) end - - json = context.with_indifferent_access - validate_against_schema(json[:schema], json[:data]) end private @@ -27,7 +24,16 @@ def validate_against_schema(schema_url, data) validator = JSONSchemer.schema(schema_definition) errors = validator.validate(data).to_a - log_schema_validation_errors(schema_url, data, errors) if errors.any? + return unless errors.any? + + error_messages = errors.map { |error| JSONSchemer::Errors.pretty(error) } + + Gitlab::ErrorTracking.track_and_raise_for_dev_exception( + ArgumentError.new("Snowplow context data does not match schema"), + schema_url: schema_url, + data: data, + validation_errors: error_messages + ) end def fetch_schema_from_iglu(schema_url) @@ -46,35 +52,11 @@ def fetch_schema_from_iglu_without_cache(schema_url) if response.success? Gitlab::Json.parse(response.body) else - exception = StandardError.new("Failed to fetch Snowplow schema from Iglu registry") - - Gitlab::ErrorTracking.track_and_raise_for_dev_exception( - exception, - schema_url: schema_url, - http_url: url, - status_code: response.code - ) + Gitlab::AppJsonLogger.warn(message: "Failed to fetch Snowplow schema from Iglu registry", + status_code: response.code, + schema_url: schema_url) nil end - rescue StandardError => e - Gitlab::ErrorTracking.track_and_raise_for_dev_exception( - e, - schema_url: schema_url, - http_url: url - ) - nil - end - - def log_schema_validation_errors(schema_url, data, errors) - error_messages = errors.map { |error| JSONSchemer::Errors.pretty(error) } - exception = ArgumentError.new("Snowplow context data does not match schema: #{error_messages.join(', ')}") - - Gitlab::ErrorTracking.track_and_raise_for_dev_exception( - exception, - schema_url: schema_url, - data: data, - validation_errors: error_messages - ) end end end diff --git a/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb b/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb index 47d61cd66fcf74..147d9be48669b2 100644 --- a/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb +++ b/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb @@ -9,7 +9,7 @@ let(:valid_data) { { 'environment' => 'test', 'source' => 'gitlab-rails' } } let(:valid_context) { { schema: valid_schema_url, data: valid_data } } - describe '#validate' do + describe '#validate!' do let(:schema_definition) do { 'type' => 'object', @@ -27,7 +27,7 @@ context 'with a valid hash context' do it 'does not raise an error' do - expect { validator.validate(valid_context) }.not_to raise_error + expect { validator.validate!(valid_context) }.not_to raise_error end end @@ -35,13 +35,13 @@ let(:contexts) { [valid_context, valid_context] } it 'validates each context in the array' do - expect { validator.validate(contexts) }.not_to raise_error + expect { validator.validate!(contexts) }.not_to raise_error end end context 'with nil context' do it 'returns early without error' do - expect { validator.validate(nil) }.not_to raise_error + expect { validator.validate!(nil) }.not_to raise_error end end end @@ -65,7 +65,7 @@ context 'when data matches the schema' do it 'does not raise an error' do - expect { validator.validate(valid_context) }.not_to raise_error + expect { validator.validate!(valid_context) }.not_to raise_error end end @@ -77,7 +77,7 @@ expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception) .with(an_instance_of(ArgumentError), hash_including(:schema_url, :validation_errors)) - validator.validate(invalid_context) + validator.validate!(invalid_context) end end end @@ -93,16 +93,16 @@ end it 'fetches and validates against the schema' do - expect { validator.validate(valid_context) }.not_to raise_error + expect { validator.validate!(valid_context) }.not_to raise_error end it 'caches the schema for subsequent requests', :use_clean_rails_memory_store_caching do # First request - should hit the HTTP endpoint - validator.validate(valid_context) + validator.validate!(valid_context) expect(WebMock).to have_requested(:get, iglu_url).once # Second request - should use cached schema - validator.validate(valid_context) + validator.validate!(valid_context) expect(WebMock).to have_requested(:get, iglu_url).once # Still only once end end @@ -110,28 +110,21 @@ context 'when schema fetch fails' do before do stub_request(:get, iglu_url) - .to_return(status: 404, body: 'Not Found') + .to_return(status: 500) end - it 'tracks the error' do - expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception) - .with(an_instance_of(StandardError), hash_including(:schema_url, :http_url, :status_code)) - - validator.validate(valid_context) - end - end - - context 'when network error occurs' do - before do - stub_request(:get, iglu_url) - .to_raise(SocketError.new('Failed to connect')) + it 'does not raise an error' do + expect { validator.validate!(valid_context) }.not_to raise_error end - it 'tracks the error' do - expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception) - .with(an_instance_of(SocketError), hash_including(:schema_url)) + it 'logs the error' do + expect(Gitlab::AppJsonLogger).to receive(:warn).with( + message: 'Failed to fetch Snowplow schema from Iglu registry', + status_code: 500, + schema_url: valid_schema_url + ) - validator.validate(valid_context) + validator.validate!(valid_context) end end end diff --git a/spec/requests/event_forward/event_forward_controller_spec.rb b/spec/requests/event_forward/event_forward_controller_spec.rb index ddb84bf8715a5f..0866bb22e02401 100644 --- a/spec/requests/event_forward/event_forward_controller_spec.rb +++ b/spec/requests/event_forward/event_forward_controller_spec.rb @@ -151,7 +151,7 @@ before do allow(Gitlab::Tracking::Destinations::SnowplowContextValidator).to receive(:new).and_return(validator) - allow(validator).to receive(:validate) + allow(validator).to receive(:validate!) payload['data'] = [event_with_context] end @@ -163,7 +163,7 @@ it 'validates the context' do request - expect(validator).to have_received(:validate).with(context_data) + expect(validator).to have_received(:validate!).with(context_data) end end @@ -175,7 +175,7 @@ it 'does not validate the context' do request - expect(validator).not_to have_received(:validate) + expect(validator).not_to have_received(:validate!) end end @@ -188,7 +188,7 @@ it 'does not validate the context' do request - expect(validator).not_to have_received(:validate) + expect(validator).not_to have_received(:validate!) end end end -- GitLab From 9e03a4bd18e4e3492b53789745ca5375c424fdb2 Mon Sep 17 00:00:00 2001 From: Niko Belokolodov Date: Wed, 22 Oct 2025 16:45:07 +1300 Subject: [PATCH 7/8] Remove schema url from Schemer retrieval --- .../tracking/destinations/snowplow_context_validator.rb | 6 ++++-- .../destinations/snowplow_context_validator_spec.rb | 9 +++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/gitlab/tracking/destinations/snowplow_context_validator.rb b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb index bd4ceee1fa0cad..5377b5fa6330c6 100644 --- a/lib/gitlab/tracking/destinations/snowplow_context_validator.rb +++ b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb @@ -21,6 +21,7 @@ def validate_against_schema(schema_url, data) schema_definition = fetch_schema_from_iglu(schema_url) return unless schema_definition + File.write('schema.json', schema_definition) validator = JSONSchemer.schema(schema_definition) errors = validator.validate(data).to_a @@ -28,8 +29,9 @@ def validate_against_schema(schema_url, data) error_messages = errors.map { |error| JSONSchemer::Errors.pretty(error) } + File.write('error.txt', error_messages) Gitlab::ErrorTracking.track_and_raise_for_dev_exception( - ArgumentError.new("Snowplow context data does not match schema"), + ArgumentError.new("Snowplow context data does not match schema: #{error_messages.join('\n}')}"), schema_url: schema_url, data: data, validation_errors: error_messages @@ -50,7 +52,7 @@ def fetch_schema_from_iglu_without_cache(schema_url) response = Gitlab::HTTP.get(url, allow_local_requests: true, timeout: 5) if response.success? - Gitlab::Json.parse(response.body) + Gitlab::Json.parse(response.body).except('$schema') # we dont need to resolve schema with JSONSchemer else Gitlab::AppJsonLogger.warn(message: "Failed to fetch Snowplow schema from Iglu registry", status_code: response.code, diff --git a/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb b/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb index 147d9be48669b2..1da46f909967df 100644 --- a/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb +++ b/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb @@ -105,6 +105,15 @@ validator.validate!(valid_context) expect(WebMock).to have_requested(:get, iglu_url).once # Still only once end + + it 'excludes $schema field from the schema definition' do + schema_with_meta = schema_definition.merge('$schema' => 'http://json-schema.org/draft-07/schema#') + + stub_request(:get, iglu_url) + .to_return(status: 200, body: schema_with_meta.to_json) + + expect { validator.validate!(valid_context) }.not_to raise_error + end end context 'when schema fetch fails' do -- GitLab From 6b003ae5fee62858151c0f7f8fdb1bbab4ec6865 Mon Sep 17 00:00:00 2001 From: Niko Belokolodov Date: Wed, 22 Oct 2025 16:57:38 +1300 Subject: [PATCH 8/8] Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Jonas Larsen --- .../tracking/destinations/snowplow_context_validator.rb | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lib/gitlab/tracking/destinations/snowplow_context_validator.rb b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb index 5377b5fa6330c6..f8856e14f4f919 100644 --- a/lib/gitlab/tracking/destinations/snowplow_context_validator.rb +++ b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb @@ -5,8 +5,6 @@ module Tracking module Destinations class SnowplowContextValidator def validate!(context) - return unless context - Array.wrap(context).each do |item| json = item.with_indifferent_access validate_against_schema(json[:schema], json[:data]) @@ -21,7 +19,6 @@ def validate_against_schema(schema_url, data) schema_definition = fetch_schema_from_iglu(schema_url) return unless schema_definition - File.write('schema.json', schema_definition) validator = JSONSchemer.schema(schema_definition) errors = validator.validate(data).to_a @@ -29,9 +26,8 @@ def validate_against_schema(schema_url, data) error_messages = errors.map { |error| JSONSchemer::Errors.pretty(error) } - File.write('error.txt', error_messages) Gitlab::ErrorTracking.track_and_raise_for_dev_exception( - ArgumentError.new("Snowplow context data does not match schema: #{error_messages.join('\n}')}"), + ArgumentError.new("Snowplow context data does not match schema: #{error_messages.join(' ')}"), schema_url: schema_url, data: data, validation_errors: error_messages -- GitLab