diff --git a/app/controllers/event_forward/event_forward_controller.rb b/app/controllers/event_forward/event_forward_controller.rb
index 08c07a3a0352d04203ee34cee9da60395e8d5734..ee86c1a76af70bbdd7ed3c53b9517859869a3a1f 100644
--- a/app/controllers/event_forward/event_forward_controller.rb
+++ b/app/controllers/event_forward/event_forward_controller.rb
@@ -24,6 +24,12 @@ def process_events
       end
 
       events_to_forward.each do |event|
+        if Rails.env.development? && event['cx']
+          # Snowplow trackers send `cx` as URL-safe Base64; map it to the standard alphabet for the lenient decoder
+          context = Gitlab::Json.parse(Base64.decode64(event['cx'].tr('-_', '+/')))
+          Gitlab::Tracking::Destinations::SnowplowContextValidator.new.validate!(context['data'])
+        end
+
         update_app_id(event)
         tracker.emit_event_payload(event)
       end
diff --git a/lib/gitlab/tracking/destinations/snowplow_context_validator.rb b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb
new file mode 100644
index 0000000000000000000000000000000000000000..f8856e14f4f9197b037ebc4ab075aff9e1bcdb9d
--- /dev/null
+++ b/lib/gitlab/tracking/destinations/snowplow_context_validator.rb
@@ -0,0 +1,82 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Tracking
+    module Destinations
+      # Validates Snowplow context payloads against the JSON schemas published in the
+      # GitLab Iglu registry. Contexts whose schema is not `iglu:com.gitlab...` are skipped.
+      class SnowplowContextValidator
+        # Validates a single context or a list of contexts.
+        #
+        # @param context [Hash, Array<Hash>, nil] entries with `schema` and `data` keys
+        # @return [void]
+        def validate!(context)
+          Array.wrap(context).each do |item|
+            # Only hash entries can carry a schema reference; anything else is skipped.
+            next unless item.respond_to?(:with_indifferent_access)
+
+            json = item.with_indifferent_access
+            validate_against_schema(json[:schema], json[:data])
+          end
+        end
+
+        private
+
+        # Validates `data` against the schema referenced by `schema_url` and reports mismatches
+        # through Gitlab::ErrorTracking.track_and_raise_for_dev_exception.
+        def validate_against_schema(schema_url, data)
+          # No need to verify payloads from standard plugins; `to_s` tolerates a missing schema key.
+          return unless schema_url.to_s.start_with?('iglu:com.gitlab')
+
+          schema_definition = fetch_schema_from_iglu(schema_url)
+          return unless schema_definition
+
+          errors = JSONSchemer.schema(schema_definition).validate(data).to_a
+          return if errors.empty?
+
+          error_messages = errors.map { |error| JSONSchemer::Errors.pretty(error) }
+
+          Gitlab::ErrorTracking.track_and_raise_for_dev_exception(
+            ArgumentError.new("Snowplow context data does not match schema: #{error_messages.join(' ')}"),
+            schema_url: schema_url,
+            data: data,
+            validation_errors: error_messages
+          )
+        end
+
+        # Returns the parsed schema, cached per schema URL for one hour. Failed lookups (nil) are not cached.
+        def fetch_schema_from_iglu(schema_url)
+          cache_key = "snowplow:schema:#{schema_url}"
+
+          Rails.cache.fetch(cache_key, expires_in: 1.hour, skip_nil: true) do
+            fetch_schema_from_iglu_without_cache(schema_url)
+          end
+        end
+
+        # Downloads and parses the schema. Returns nil, after logging a warning, when the registry
+        # responds with an error status, cannot be reached, or serves a body that is not valid JSON.
+        def fetch_schema_from_iglu_without_cache(schema_url)
+          url = "https://gitlab-org.gitlab.io/iglu/schemas/#{schema_url.delete_prefix('iglu:')}"
+
+          response = Gitlab::HTTP.get(url, allow_local_requests: true, timeout: 5)
+
+          if response.success?
+            Gitlab::Json.parse(response.body).except('$schema') # we don't need to resolve schema with JSONSchemer
+          else
+            log_fetch_failure(schema_url, status_code: response.code)
+          end
+        rescue *Gitlab::HTTP::HTTP_ERRORS, JSON::ParserError => e
+          log_fetch_failure(schema_url, error: e.message)
+        end
+
+        # Logs why the schema could not be loaded and returns nil so that callers skip validation.
+        def log_fetch_failure(schema_url, **details)
+          Gitlab::AppJsonLogger.warn(message: "Failed to fetch Snowplow schema from Iglu registry",
+            schema_url: schema_url,
+            **details)
+          nil
+        end
+      end
+    end
+  end
+end
diff --git a/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb b/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb
new file mode 100644
index 0000000000000000000000000000000000000000..1da46f909967df038feab9c3bc7186fef9269cfb
--- /dev/null
+++ b/spec/lib/gitlab/tracking/destinations/snowplow_context_validator_spec.rb
@@ -0,0 +1,167 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Gitlab::Tracking::Destinations::SnowplowContextValidator, feature_category: :application_instrumentation do
+  subject(:validator) { described_class.new }
+
+  let(:valid_schema_url) { 'iglu:com.gitlab/gitlab_standard/jsonschema/1-1-7' }
+  let(:valid_data) { { 'environment' => 'test', 'source' => 'gitlab-rails' } }
+  let(:valid_context) { { schema: valid_schema_url, data: valid_data } }
+
+  describe '#validate!' do
+    let(:schema_definition) do
+      {
+        'type' => 'object',
+        'properties' => {
+          'environment' => { 'type' => 'string' },
+          'source' => { 'type' => 'string' }
+        }
+      }
+    end
+
+    before do
+      stub_request(:get, "https://gitlab-org.gitlab.io/iglu/schemas/#{valid_schema_url.delete_prefix('iglu:')}")
+        .to_return(status: 200, body: schema_definition.to_json)
+    end
+
+    context 'with a valid hash context' do
+      it 'does not raise an error' do
+        expect { validator.validate!(valid_context) }.not_to raise_error
+      end
+    end
+
+    context 'with an array of contexts' do
+      let(:contexts) { [valid_context, valid_context] }
+
+      it 'validates each context in the array' do
+        expect { validator.validate!(contexts) }.not_to raise_error
+      end
+    end
+
+    context 'with nil context' do
+      it 'returns early without error' do
+        expect { validator.validate!(nil) }.not_to raise_error
+      end
+    end
+
+    context 'with a context that has no schema key' do
+      it 'skips the context without error' do
+        expect { validator.validate!({ data: valid_data }) }.not_to raise_error
+      end
+    end
+  end
+
+  describe 'schema validation', :freeze_time do
+    let(:schema_definition) do
+      {
+        'type' => 'object',
+        'properties' => {
+          'environment' => { 'type' => 'string' },
+          'source' => { 'type' => 'string' }
+        },
+        'required' => %w[environment source]
+      }
+    end
+
+    before do
+      stub_request(:get, "https://gitlab-org.gitlab.io/iglu/schemas/#{valid_schema_url.delete_prefix('iglu:')}")
+        .to_return(status: 200, body: schema_definition.to_json)
+    end
+
+    context 'when data matches the schema' do
+      it 'does not raise an error' do
+        expect { validator.validate!(valid_context) }.not_to raise_error
+      end
+    end
+
+    context 'when data does not match the schema' do
+      let(:invalid_data) { { 'invalid_field' => 'value' } }
+      let(:invalid_context) { { schema: valid_schema_url, data: invalid_data } }
+
+      it 'tracks the validation error' do
+        expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception)
+          .with(an_instance_of(ArgumentError), hash_including(:schema_url, :validation_errors))
+
+        validator.validate!(invalid_context)
+      end
+    end
+  end
+
+  describe 'schema fetching from Iglu' do
+    let(:iglu_url) { "https://gitlab-org.gitlab.io/iglu/schemas/com.gitlab/gitlab_standard/jsonschema/1-1-7" }
+    let(:schema_definition) { { 'type' => 'object' } }
+
+    context 'when schema fetch succeeds' do
+      before do
+        stub_request(:get, iglu_url)
+          .to_return(status: 200, body: schema_definition.to_json)
+      end
+
+      it 'fetches and validates against the schema' do
+        expect { validator.validate!(valid_context) }.not_to raise_error
+      end
+
+      it 'caches the schema for subsequent requests', :use_clean_rails_memory_store_caching do
+        # First request - should hit the HTTP endpoint
+        validator.validate!(valid_context)
+        expect(WebMock).to have_requested(:get, iglu_url).once
+
+        # Second request - should use cached schema
+        validator.validate!(valid_context)
+        expect(WebMock).to have_requested(:get, iglu_url).once # Still only once
+      end
+
+      it 'excludes $schema field from the schema definition' do
+        schema_with_meta = schema_definition.merge('$schema' => 'http://json-schema.org/draft-07/schema#')
+
+        stub_request(:get, iglu_url)
+          .to_return(status: 200, body: schema_with_meta.to_json)
+
+        expect { validator.validate!(valid_context) }.not_to raise_error
+      end
+    end
+
+    context 'when schema fetch fails' do
+      before do
+        stub_request(:get, iglu_url)
+          .to_return(status: 500)
+      end
+
+      it 'does not raise an error' do
+        expect { validator.validate!(valid_context) }.not_to raise_error
+      end
+
+      it 'logs the error' do
+        expect(Gitlab::AppJsonLogger).to receive(:warn).with(
+          message: 'Failed to fetch Snowplow schema from Iglu registry',
+          status_code: 500,
+          schema_url: valid_schema_url
+        )
+
+        validator.validate!(valid_context)
+      end
+    end
+
+    context 'when the registry cannot be reached' do
+      before do
+        stub_request(:get, iglu_url).to_timeout
+      end
+
+      it 'does not raise an error' do
+        expect { validator.validate!(valid_context) }.not_to raise_error
+      end
+
+      it 'logs the error' do
+        expect(Gitlab::AppJsonLogger).to receive(:warn).with(
+          hash_including(
+            message: 'Failed to fetch Snowplow schema from Iglu registry',
+            schema_url: valid_schema_url
+          )
+        )
+
+        validator.validate!(valid_context)
+      end
+    end
+  end
+end
diff --git a/spec/requests/event_forward/event_forward_controller_spec.rb b/spec/requests/event_forward/event_forward_controller_spec.rb
index 186879d46f59186c198a0b15040d5013851225a5..0866bb22e02401145a31d7328da0eb20bd38c86d 100644
--- a/spec/requests/event_forward/event_forward_controller_spec.rb
+++ b/spec/requests/event_forward/event_forward_controller_spec.rb
@@ -142,5 +142,68 @@
         request
       end
     end
+
+    describe 'context validation' do
+      let(:validator) { instance_double(Gitlab::Tracking::Destinations::SnowplowContextValidator) }
+      let(:context_data) { [{ 'schema' => 'iglu:com.gitlab/test/jsonschema/1-0-0', 'data' => { 'key' => 'value' } }] }
+      let(:encoded_context) { Base64.encode64({ 'data' => context_data }.to_json) }
+      let(:event_with_context) { { 'se_ac' => 'event_1', 'aid' => 'app_id_1', 'cx' => encoded_context } }
+
+      before do
+        allow(Gitlab::Tracking::Destinations::SnowplowContextValidator).to receive(:new).and_return(validator)
+        allow(validator).to receive(:validate!)
+        payload['data'] = [event_with_context]
+      end
+
+      context 'when in development environment' do
+        before do
+          allow(Rails.env).to receive(:development?).and_return(true)
+        end
+
+        it 'validates the context' do
+          request
+
+          expect(validator).to have_received(:validate!).with(context_data)
+        end
+
+        context 'when the context is URL-safe Base64 encoded' do
+          let(:context_data) do
+            [{ 'schema' => 'iglu:com.gitlab/test/jsonschema/1-0-0', 'data' => { 'key' => '~~~' } }]
+          end
+          let(:encoded_context) { Base64.urlsafe_encode64({ 'data' => context_data }.to_json, padding: false) }
+
+          it 'decodes and validates the context' do
+            request
+
+            expect(validator).to have_received(:validate!).with(context_data)
+          end
+        end
+      end
+
+      context 'when not in development environment' do
+        before do
+          allow(Rails.env).to receive(:development?).and_return(false)
+        end
+
+        it 'does not validate the context' do
+          request
+
+          expect(validator).not_to have_received(:validate!)
+        end
+      end
+
+      context 'when event does not have cx field' do
+        before do
+          allow(Rails.env).to receive(:development?).and_return(true)
+          payload['data'] = [event_1]
+        end
+
+        it 'does not validate the context' do
+          request
+
+          expect(validator).not_to have_received(:validate!)
+        end
+      end
+    end
   end
 end