From abf7d688a3ace0c254df75d42e5f3855a5d530be Mon Sep 17 00:00:00 2001 From: Asherah Connor Date: Fri, 3 Oct 2025 12:11:17 +1000 Subject: [PATCH 1/3] Ensure redacted links do not re-escape their inner HTML https://gitlab.com/gitlab-org/security/gitlab/-/merge_requests/2275 escaped all data-original contents before using it in ReferenceRedactor, but this leads to legitimate original HTML content being escaped again; see https://gitlab.com/gitlab-org/gitlab/-/issues/572761. The actual root cause is LabelReferenceFilter#references_in and other similar overrides called by AbstractReferenceFilter#object_link_filter; it matches on de-escaped text, and then re-escapes it when returning to its caller --- but it didn't re-escape when yielding. The yielded match eventually becomes data-original, causing HTML content such as `<script>` to be yielded as ` => nil ``` In other words, despite the different look of the two definitions, they both represent their payload tags in their raw form, unescaped form, because one layer of entities is unescaped when an HTML attribute is parsed. This can produce misleading results. In the spec, note that the raw HTML (`original_content`) was: (a) inserted into the attribute (resulting in one level of entity unescaping), and \ (b) compared for equality at the end against `doc.to_html`. If given unescaped entities like ``, the spec effectively asserts that the data-original (logical) content was inserted directly into the page's HTML. This was the original state of the spec. On the other hand, if given enscaped entities (like `<script>...`), the spec effectively asserts that the data-original (logical) content is _escaped_ into the page's HTML, because we are asserting those entity escapes are present in the result from `#to_html`. We don't actually want data-original's content to be escaped into the page's HTML --- we want data-original's content to faithfully represent the original content! I've have modified the spec to instead construct the attribute via Nokogiri's DOM, meaning we can't be tripped up by this. There's now a pair of specs that assert that, whatever the data-original content, it finds its way correctly to the output There's also an additional spec which asserts the root vulnerability remains closed in LabelReferenceFilter. Changelog: changed --- .../string_identifier_argument.yml | 2 +- .../references/iteration_reference_filter.rb | 24 +-- .../iterations_cadence_reference_filter.rb | 23 +-- .../references/epic_reference_filter.rb | 10 +- .../references/label_reference_filter.rb | 2 +- .../vulnerability_reference_filter.rb | 18 +- .../references/epic_reference_filter_spec.rb | 17 +- .../iteration_reference_filter_spec.rb | 21 +- ...terations_cadence_reference_filter_spec.rb | 19 +- .../vulnerability_reference_filters_spec.rb | 7 +- lib/banzai/filter/playable_link_filter.rb | 6 +- .../references/abstract_reference_filter.rb | 174 +++++++--------- .../commit_range_reference_filter.rb | 4 +- .../references/commit_reference_filter.rb | 3 +- .../references/design_reference_filter.rb | 2 +- .../external_issue_reference_filter.rb | 39 ++-- .../references/issue_reference_filter.rb | 2 +- .../references/label_reference_filter.rb | 19 +- .../merge_request_reference_filter.rb | 2 +- .../references/milestone_reference_filter.rb | 27 +-- .../references/project_reference_filter.rb | 31 +-- .../filter/references/reference_cache.rb | 6 +- .../filter/references/reference_filter.rb | 190 +++++++++++++----- .../references/user_reference_filter.rb | 71 +++---- lib/banzai/reference_redactor.rb | 1 - lib/gitlab/untrusted_regexp.rb | 2 + lib/gitlab/utils/gsub.rb | 2 + .../abstract_reference_filter_spec.rb | 31 ++- .../commit_range_reference_filter_spec.rb | 5 + .../commit_reference_filter_spec.rb | 5 + .../external_issue_reference_filter_spec.rb | 26 +++ .../references/label_reference_filter_spec.rb | 18 ++ .../milestone_reference_filter_spec.rb | 22 +- .../project_reference_filter_spec.rb | 4 + .../references/reference_filter_spec.rb | 51 ++--- .../references/user_reference_filter_spec.rb | 4 + .../lib/banzai/pipeline/full_pipeline_spec.rb | 29 ++- spec/lib/banzai/reference_redactor_spec.rb | 16 +- spec/mailers/emails/service_desk_spec.rb | 5 +- .../timeline_events_spec.rb | 3 +- spec/support/matchers/eq_html.rb | 82 ++++++++ .../reference_filter_shared_examples.rb | 89 -------- .../banzai/filters/emoji_shared_examples.rb | 0 .../filters/filter_timeout_shared_examples.rb | 0 .../reference_filter_shared_examples.rb | 154 ++++++++++++++ .../support_specs/matchers/match_html_spec.rb | 25 +++ 46 files changed, 826 insertions(+), 467 deletions(-) create mode 100644 spec/support/matchers/eq_html.rb delete mode 100644 spec/support/shared_examples/banzai/filters/reference_filter_shared_examples.rb rename spec/support/shared_examples/{ => lib}/banzai/filters/emoji_shared_examples.rb (100%) rename spec/support/shared_examples/{ => lib}/banzai/filters/filter_timeout_shared_examples.rb (100%) create mode 100644 spec/support_specs/matchers/match_html_spec.rb diff --git a/.rubocop_todo/performance/string_identifier_argument.yml b/.rubocop_todo/performance/string_identifier_argument.yml index 047a32c35f726c..d067d03315a966 100644 --- a/.rubocop_todo/performance/string_identifier_argument.yml +++ b/.rubocop_todo/performance/string_identifier_argument.yml @@ -184,12 +184,12 @@ Performance/StringIdentifierArgument: - 'spec/support/helpers/redis_helpers.rb' - 'spec/support/redis.rb' - 'spec/support/shared_contexts/requests/api/debian_repository_shared_context.rb' - - 'spec/support/shared_examples/banzai/filters/reference_filter_shared_examples.rb' - 'spec/support/shared_examples/ci/jwt_shared_examples.rb' - 'spec/support/shared_examples/controllers/application_settings_shared_examples.rb' - 'spec/support/shared_examples/controllers/githubish_import_controller_shared_examples.rb' - 'spec/support/shared_examples/database_health_status_indicators/prometheus_alert_based_shared_examples.rb' - 'spec/support/shared_examples/graphql/container_expiration_policy_shared_examples.rb' + - 'spec/support/shared_examples/lib/banzai/filters/reference_filter_shared_examples.rb' - 'spec/support/shared_examples/models/active_record_enum_shared_examples.rb' - 'spec/support/shared_examples/models/application_setting_shared_examples.rb' - 'spec/support/shared_examples/models/concerns/cascading_namespace_setting_shared_examples.rb' diff --git a/ee/lib/banzai/filter/references/iteration_reference_filter.rb b/ee/lib/banzai/filter/references/iteration_reference_filter.rb index 91a29bc31bdc48..cb0c4b9c5b5891 100644 --- a/ee/lib/banzai/filter/references/iteration_reference_filter.rb +++ b/ee/lib/banzai/filter/references/iteration_reference_filter.rb @@ -51,7 +51,10 @@ def parse_symbol(symbol, match_data) # holds the id { iteration_id: symbol.to_i, iteration_name: nil } else - { iteration_id: match_data[:iteration_id]&.to_i, iteration_name: match_data[:iteration_name]&.tr('"', '') } + { + iteration_id: match_data[:iteration_id]&.to_i, + iteration_name: match_data[:iteration_name]&.tr('"', '') + } end end @@ -80,23 +83,10 @@ def references_in(text, pattern = ::Iteration.reference_pattern) # default implementation. return super(text, pattern) if pattern != ::Iteration.reference_pattern - iterations = {} - - unescaped_html = unescape_html_entities(text).gsub(pattern).with_index do |match, index| - ident = identifier($~) - iteration = yield match, ident, $~[:project], $~[:namespace], $~ - - if iteration != match - iterations[index] = iteration - "#{::Banzai::Filter::References::AbstractReferenceFilter::REFERENCE_PLACEHOLDER}#{index}" - else - match - end + replace_references_in_text_with_html(text.gsub(pattern)) do |match_data| + ident = identifier(match_data) + yield match_data[0], ident, match_data[:project], match_data[:namespace], match_data end - - return text if iterations.empty? - - escape_with_placeholders(unescaped_html, iterations) end def find_iterations(parent, ids: nil, names: nil) diff --git a/ee/lib/banzai/filter/references/iterations_cadence_reference_filter.rb b/ee/lib/banzai/filter/references/iterations_cadence_reference_filter.rb index 50ccf109e82de6..789988e90f0703 100644 --- a/ee/lib/banzai/filter/references/iterations_cadence_reference_filter.rb +++ b/ee/lib/banzai/filter/references/iterations_cadence_reference_filter.rb @@ -46,7 +46,10 @@ def parent_records(parent, ids) def parse_symbol(symbol, match_data) return { cadence_id: symbol.to_i, cadence_title: nil } if symbol - { cadence_id: match_data[:cadence_id]&.to_i, cadence_title: match_data[:cadence_title]&.tr('"', '') } + { + cadence_id: match_data[:cadence_id]&.to_i, + cadence_title: match_data[:cadence_title]&.tr('"', '') + } end # This method has the contract that if a string `ref` refers to a @@ -57,22 +60,10 @@ def record_identifier(record) end def references_in(text, pattern = object_class.reference_pattern) - cadences = {} - - unescaped_html = unescape_html_entities(text).gsub(pattern).with_index do |match, index| - ident = identifier($~) - cadence = yield match, ident, nil, nil, $~ - - next match if cadence == match - - cadences[index] = cadence - - "#{::Banzai::Filter::References::AbstractReferenceFilter::REFERENCE_PLACEHOLDER}#{index}" + replace_references_in_text_with_html(text.gsub(pattern)) do |match_data| + ident = identifier(match_data) + yield match_data[0], ident, nil, nil, match_data end - - return text if cadences.empty? - - escape_with_placeholders(unescaped_html, cadences) end def url_for_object(cadence, group) diff --git a/ee/lib/ee/banzai/filter/references/epic_reference_filter.rb b/ee/lib/ee/banzai/filter/references/epic_reference_filter.rb index 80ad20809a91b6..a235456503a7f5 100644 --- a/ee/lib/ee/banzai/filter/references/epic_reference_filter.rb +++ b/ee/lib/ee/banzai/filter/references/epic_reference_filter.rb @@ -12,13 +12,11 @@ module EpicReferenceFilter extend ActiveSupport::Concern def references_in(text, pattern = object_class.reference_pattern) - ::Gitlab::Utils::Gsub - .gsub_with_limit(text, pattern, limit: ::Banzai::Filter::FILTER_ITEM_LIMIT) do |match_data| + replace_references_in_text_with_html(::Gitlab::Utils::Gsub.gsub_with_limit(text, pattern, + limit: ::Banzai::Filter::FILTER_ITEM_LIMIT)) do |match_data| symbol = match_data[object_sym] if object_class.reference_valid?(symbol) yield match_data[0], symbol.to_i, nil, match_data[:group], match_data - else - match_data[0] end end end @@ -32,9 +30,9 @@ def reference_class(object_sym, tooltip: false) super end - def data_attributes_for(text, group, object, link_content: false, link_reference: false) + def data_attributes_for(original, group, object, link_content: false, link_reference: false) { - original: text, + original: original, link: link_content, link_reference: link_reference, group: group.id, diff --git a/ee/lib/ee/banzai/filter/references/label_reference_filter.rb b/ee/lib/ee/banzai/filter/references/label_reference_filter.rb index eee57295f27cb1..6cdab6d09621e0 100644 --- a/ee/lib/ee/banzai/filter/references/label_reference_filter.rb +++ b/ee/lib/ee/banzai/filter/references/label_reference_filter.rb @@ -8,7 +8,7 @@ module LabelReferenceFilter extend ::Gitlab::Utils::Override override :data_attributes_for - def data_attributes_for(text, parent, object, link_content: false, link_reference: false) + def data_attributes_for(original, parent, object, link_content: false, link_reference: false) return super unless object.scoped_label? # Enabling HTML tooltips for scoped labels here. diff --git a/ee/lib/ee/banzai/filter/references/vulnerability_reference_filter.rb b/ee/lib/ee/banzai/filter/references/vulnerability_reference_filter.rb index 85611f630856ca..5b893e7172a59f 100644 --- a/ee/lib/ee/banzai/filter/references/vulnerability_reference_filter.rb +++ b/ee/lib/ee/banzai/filter/references/vulnerability_reference_filter.rb @@ -12,30 +12,22 @@ module VulnerabilityReferenceFilter extend ActiveSupport::Concern def references_in(text, pattern = object_class.reference_pattern) - text.gsub(pattern) do |match| - symbol = $~[object_sym] + replace_references_in_text_with_html(text.gsub(pattern)) do |match_data| + symbol = match_data[object_sym] if object_class.reference_valid?(symbol) - yield match, symbol.to_i, $~[:project], $~[:namespace], $~ - else - match + yield match_data[0], symbol.to_i, match_data[:project], match_data[:namespace], match_data end end end - def unescape_link(href) - return href if object_class.reference_pattern.match?(href) - - super - end - def url_for_object(vulnerability, project) urls = ::Gitlab::Routing.url_helpers urls.project_security_vulnerability_url(project, vulnerability, only_path: context[:only_path]) end - def data_attributes_for(text, project, object, link_content: false, link_reference: false) + def data_attributes_for(original, project, object, link_content: false, link_reference: false) { - original: text, + original: original, link: link_content, link_reference: link_reference, project: project.id, diff --git a/ee/spec/lib/banzai/filter/references/epic_reference_filter_spec.rb b/ee/spec/lib/banzai/filter/references/epic_reference_filter_spec.rb index 81324772abf176..549cffe43e9fdd 100644 --- a/ee/spec/lib/banzai/filter/references/epic_reference_filter_spec.rb +++ b/ee/spec/lib/banzai/filter/references/epic_reference_filter_spec.rb @@ -74,7 +74,7 @@ def doc(reference = nil) link = doc(reference).css('a').first expect(link).to have_attribute('data-original') - expect(link.attr('data-original')).to eq(CGI.escapeHTML(reference)) + expect(link.attr('data-original')).to eq_html(reference) end it 'includes a data-reference-format attribute' do @@ -102,13 +102,13 @@ def doc(reference = nil) it 'ignores invalid epic IIDs' do text = "Check &#{non_existing_record_iid}" - expect(doc(text).to_s).to include(ERB::Util.html_escape_once(text)) + expect(doc(text).to_s).to include_html(text) end it 'ignores out of range epic IDs' do text = "Check &1161452270761535925900804973910297" - expect(doc(text).to_s).to include(ERB::Util.html_escape_once(text)) + expect(doc(text).to_s).to include_html(text) end it 'does not process links containing epic numbers followed by text' do @@ -139,7 +139,7 @@ def doc(reference = nil) it 'ignores invalid epic IIDs' do text = "Check &#{non_existing_record_iid}" - expect(doc(text).to_s).to include(ERB::Util.html_escape_once(text)) + expect(doc(text).to_s).to include(text) end end @@ -203,13 +203,13 @@ def doc(reference = nil) it 'ignores a shorthand reference from another group' do text = "Check &#{epic.iid}" - expect(doc(text).to_s).to include(ERB::Util.html_escape_once(text)) + expect(doc(text).to_s).to include_html(text) end it 'ignores reference with incomplete group path' do text = "Check @#{epic.group.path}&#{epic.iid}" - expect(doc(text).to_s).to include(ERB::Util.html_escape_once(text)) + expect(doc(text).to_s).to include_html(text) end it 'links to a valid reference for full reference' do @@ -354,4 +354,9 @@ def doc(reference = nil) let(:filter_result) { reference_filter(text, project: nil, group: group) } let(:ends_with) { " #{CGI.escapeHTML(epic.to_reference(group))}

" } end + + it_behaves_like 'ReferenceFilter#references_in' do + let(:reference) { epic.to_reference(group) } + let(:filter_instance) { described_class.new(nil, { project: nil, group: group }) } + end end diff --git a/ee/spec/lib/banzai/filter/references/iteration_reference_filter_spec.rb b/ee/spec/lib/banzai/filter/references/iteration_reference_filter_spec.rb index 8d2307e51c54f3..27c338e245d40c 100644 --- a/ee/spec/lib/banzai/filter/references/iteration_reference_filter_spec.rb +++ b/ee/spec/lib/banzai/filter/references/iteration_reference_filter_spec.rb @@ -2,7 +2,7 @@ require 'spec_helper' -RSpec.describe Banzai::Filter::References::IterationReferenceFilter do +RSpec.describe Banzai::Filter::References::IterationReferenceFilter, feature_category: :markdown do include FilterSpecHelper let(:parent_group) { create(:group, :public) } @@ -331,4 +331,23 @@ end.not_to exceed_all_query_limit(max_count) end end + + it_behaves_like 'a reference which does not unescape its content in data-original' do + let(:context) { { project: nil, group: group } } + let(:iteration_title) { "x<script>alert('xss');</script>" } + let(:resource) { create(:iteration, title: iteration_title, group: group) } + let(:reference) { %(#{resource.class.reference_prefix}"#{iteration_title}") } + + # This is probably bad. + let(:expected_resource_title) { "x" } + + let(:expected_href) { urls.iteration_url(resource) } + let(:expected_replacement_text) { resource.display_text } + end + + it_behaves_like 'ReferenceFilter#references_in' do + let(:iteration) { create(:iteration, :with_title, group: group) } + let(:reference) { %(#{Iteration.reference_prefix}"#{iteration.name}") } + let(:filter_instance) { described_class.new(nil, { project: nil }) } + end end diff --git a/ee/spec/lib/banzai/filter/references/iterations_cadence_reference_filter_spec.rb b/ee/spec/lib/banzai/filter/references/iterations_cadence_reference_filter_spec.rb index e7686c03cda94d..f0715d8a86b412 100644 --- a/ee/spec/lib/banzai/filter/references/iterations_cadence_reference_filter_spec.rb +++ b/ee/spec/lib/banzai/filter/references/iterations_cadence_reference_filter_spec.rb @@ -39,7 +39,7 @@ link = doc(text).css('a').first expect(link).to have_attribute('data-original') - expect(link.attr('data-original')).to eq(reference) + expect(link.attr('data-original')).to eq_html(reference) end end @@ -119,4 +119,21 @@ def doc(text) expect { reference_filter(markdown, context) }.not_to exceed_all_query_limit(1) end end + + it_behaves_like 'a reference which does not unescape its content in data-original' do + let(:context) { { project: nil, group: group } } + let(:cadence_title) { "x" } + let(:cadence_title_escaped) { "x<script>alert('xss');</script>" } + let(:resource) { create(:iterations_cadence, title: cadence_title, group: group) } + let(:reference) { %(#{resource.class.reference_prefix}"#{cadence_title_escaped}"#{resource.class.reference_postfix}) } + + let(:expected_resource_title) { cadence_title } + let(:expected_href) { urls.group_iteration_cadences_url(group, resource) } + let(:expected_replacement_text) { "#{resource.class.reference_prefix}#{resource.id}#{resource.class.reference_postfix}" } + end + + it_behaves_like 'ReferenceFilter#references_in' do + let(:reference) { cadence.to_reference } + let(:filter_instance) { described_class.new(nil, { project: nil }) } + end end diff --git a/ee/spec/lib/banzai/filter/references/vulnerability_reference_filters_spec.rb b/ee/spec/lib/banzai/filter/references/vulnerability_reference_filters_spec.rb index d5fcf17b5e59cf..d67a2971f8825c 100644 --- a/ee/spec/lib/banzai/filter/references/vulnerability_reference_filters_spec.rb +++ b/ee/spec/lib/banzai/filter/references/vulnerability_reference_filters_spec.rb @@ -2,7 +2,7 @@ require 'spec_helper' -RSpec.describe Banzai::Filter::References::VulnerabilityReferenceFilter do +RSpec.describe Banzai::Filter::References::VulnerabilityReferenceFilter, feature_category: :markdown do include FilterSpecHelper let(:urls) { Gitlab::Routing.url_helpers } @@ -198,4 +198,9 @@ def doc(reference = nil) expect { reference_filter(multiple_references).to_html }.not_to exceed_query_limit(control) end end + + it_behaves_like 'ReferenceFilter#references_in' do + let(:reference) { "[vulnerability:#{vulnerability.id}]" } + let(:filter_instance) { described_class.new(nil, { project: }) } + end end diff --git a/lib/banzai/filter/playable_link_filter.rb b/lib/banzai/filter/playable_link_filter.rb index a56961a0c26cdd..77d64965ef2b11 100644 --- a/lib/banzai/filter/playable_link_filter.rb +++ b/lib/banzai/filter/playable_link_filter.rb @@ -55,14 +55,14 @@ def media_element(doc, element) end def download_link(doc, element) - link_content = element['title'] || element['alt'] + link_text = element['title'] || element['alt'] link_element_attrs = { class: 'gl-text-sm gl-text-subtle gl-mb-1', href: element['src'], target: '_blank', rel: 'noopener noreferrer', - title: "Download '#{link_content}'" + title: "Download '#{link_text}'" } # make sure the original non-proxied src carries over @@ -70,7 +70,7 @@ def download_link(doc, element) link_element_attrs['data-canonical-src'] = element['data-canonical-src'] end - doc.document.create_element('a', link_content, link_element_attrs) + doc.document.create_element('a', link_text, link_element_attrs) end def media_node(doc, element) diff --git a/lib/banzai/filter/references/abstract_reference_filter.rb b/lib/banzai/filter/references/abstract_reference_filter.rb index dc21b224b1d3a7..b5a0a0d7dfe9c3 100644 --- a/lib/banzai/filter/references/abstract_reference_filter.rb +++ b/lib/banzai/filter/references/abstract_reference_filter.rb @@ -16,47 +16,31 @@ def initialize(doc, context = nil, result = nil) @reference_cache = ReferenceCache.new(self, context, result) end - # REFERENCE_PLACEHOLDER is used for re-escaping HTML text except found - # reference (which we replace with placeholder during re-scaping). The - # random number helps ensure it's pretty close to unique. Since it's a - # transitory value (it never gets saved) we can initialize once, and it - # doesn't matter if it changes on a restart. - REFERENCE_PLACEHOLDER = "_reference_#{SecureRandom.hex(16)}_" - REFERENCE_PLACEHOLDER_PATTERN = %r{#{REFERENCE_PLACEHOLDER}(\d+)} - - # Public: Find references in text (like `!123` for merge requests) + # See ReferenceFilter#references_in for requirements on the input, block, + # and return value of this function. # - # references_in(text) do |match, id, project_ref, matches| - # object = find_object(project_ref, id) - # "#{object.to_reference}" - # end - # - # text - String text to search. - # - # Yields the String match, the Integer referenced object ID, an optional String - # of the external project reference, and all of the matchdata. - # - # Returns a String replaced with the return of the block. + # This function yields the String match, the Integer referenced object ID, an optional String + # of the external project reference, an optional String of the namespace reference, + # and the full MatchData. def references_in(text, pattern = object_class.reference_pattern) - Gitlab::Utils::Gsub.gsub_with_limit(text, pattern, limit: Banzai::Filter::FILTER_ITEM_LIMIT) do |match_data| + replace_references_in_text_with_html(Gitlab::Utils::Gsub.gsub_with_limit(text, pattern, + limit: Banzai::Filter::FILTER_ITEM_LIMIT)) do |match_data| if ident = identifier(match_data) - yield match_data[0], ident, match_data.named_captures['project'], match_data.named_captures['namespace'], - match_data - else - match_data[0] + yield match_data[0], ident, match_data.named_captures['project'], + match_data.named_captures['namespace'], match_data end end end def identifier(match_data) - symbol = symbol_from_match(match_data) + symbol = symbol_from_match_data(match_data) parse_symbol(symbol, match_data) if object_class.reference_valid?(symbol) end - def symbol_from_match(match) + def symbol_from_match_data(match_data) key = object_sym - match[key] if match.names.include?(key.to_s) + match_data[key] if match_data.names.include?(key.to_s) end # Transform a symbol extracted from the text to a meaningful value @@ -133,34 +117,32 @@ def call nodes.each_with_index do |node, index| if text_node?(node) && ref_pattern - replace_text_when_pattern_matches(node, index, ref_pattern) do |content| + replace_node_when_text_matches(node, index, ref_pattern) do |content| object_link_filter(content, ref_pattern) end elsif element_node?(node) - yield_valid_link(node) do |link, inner_html| + yield_valid_link(node) do |link, text, inner_html| if ref_pattern && link =~ ref_pattern_anchor - replace_link_node_with_href(node, index, link) do - object_link_filter(link, ref_pattern_anchor, link_content: inner_html) - end + html = object_link_filter(link, ref_pattern_anchor, link_content_html: inner_html) + replace_node_with_html(node, index, html) if html next end next unless link_pattern - if link == inner_html && inner_html =~ link_pattern_start - replace_link_node_with_text(node, index) do - object_link_filter(inner_html, link_pattern_start, link_reference: true) - end + if link == text && text =~ link_pattern_start + html = object_link_filter(text, link_pattern_start, link_reference: true) + replace_node_with_html(node, index, html) if html next end if link_pattern_anchor.match?(link) - replace_link_node_with_href(node, index, link) do - object_link_filter(link, link_pattern_anchor, link_content: inner_html, link_reference: true) - end + html = object_link_filter(link, link_pattern_anchor, link_content_html: inner_html, + link_reference: true) + replace_node_with_html(node, index, html) if html next end @@ -176,14 +158,21 @@ def call # # text - String text to replace references in. # pattern - Reference pattern to match against. - # link_content - Original content of the link being replaced. + # link_content_html - Original HTML content of the link being replaced. # link_reference - True if this was using the link reference pattern, # false otherwise. # - # Returns a String with references replaced with links. All links - # have `gfm` and `gfm-OBJECT_NAME` class names attached for styling. - def object_link_filter(text, pattern, link_content: nil, link_reference: false) - references_in(text, pattern) do |match, id, project_ref, namespace_ref, matches| + # Returns String HTML with references replaced with links, or nil if no + # replacements were made. All links have `gfm` and `gfm-OBJECT_NAME` + # class names attached for styling. + # + # Note carefully: + # + # * `text` is text, not HTML. + # * `link_content_html`, if provided, is HTML. + # * The return value is HTML (or nil). + def object_link_filter(text, pattern, link_content_html: nil, link_reference: false) + references_in(text, pattern) do |match_text, id, project_ref, namespace_ref, matches| parent_path = if parent_type == :group reference_cache.full_group_path(namespace_ref) elsif parent_type == :namespace @@ -193,54 +182,51 @@ def object_link_filter(text, pattern, link_content: nil, link_reference: false) end parent = from_ref_cached(parent_path) + next unless parent - if parent - object = - if link_reference - find_object_from_link_cached(parent, id) - else - find_object_cached(parent, id) - end - end + object = + if link_reference + find_object_from_link_cached(parent, id) + else + find_object_cached(parent, id) + end - if object - title = object_link_title(object, matches) - klass = reference_class(object_sym) - - data_attributes = data_attributes_for( - link_content || match, - parent, - object, - link_content: !!link_content, - link_reference: link_reference - ) - data_attributes[:reference_format] = matches[:format] if matches.names.include?("format") - data_attributes.merge!(additional_object_attributes(object)) - - data = data_attribute(data_attributes) - - url = - if matches.names.include?("url") && matches[:url] - matches[:url] - else - url_for_object_cached(object, parent) - end + next unless object - url.chomp!(matches[:format]) if matches.names.include?("format") + title = object_link_title(object, matches) + klass = reference_class(object_sym) - content = context[:link_text] || link_content || object_link_text(object, matches) + data_attributes = data_attributes_for( + link_content_html || CGI.escapeHTML(match_text), + parent, + object, + link_content: !!link_content_html, + link_reference: link_reference + ) + data_attributes[:reference_format] = matches[:format] if matches.names.include?("format") + data_attributes.merge!(additional_object_attributes(object)) - link = write_opening_tag("a", { - "href" => url, - "title" => title, - "class" => klass, - **data - }) << content.to_s << "" + data = data_attribute(data_attributes) - wrap_link(link, object) - else - match - end + url = + if matches.names.include?("url") && matches[:url] + matches[:url] + else + url_for_object_cached(object, parent) + end + + url.chomp!(matches[:format]) if matches.names.include?("format") + + content = context[:link_text] || link_content_html || object_link_text(object, matches) + + link = write_opening_tag("a", { + "href" => url, + "title" => title, + "class" => klass, + **data + }) << content.to_s << "" + + wrap_link(link, object) end end @@ -248,7 +234,9 @@ def wrap_link(link, object) link end - def data_attributes_for(text, parent, object, link_content: false, link_reference: false) + # "link_content" is true when "original" is the inner HTML content, or false when "original" + # is HTML-escaped plain text representing the link as written. + def data_attributes_for(original, parent, object, link_content: false, link_reference: false) parent_id = case parent when Group { group: parent.id, namespace: parent.id } @@ -259,7 +247,7 @@ def data_attributes_for(text, parent, object, link_content: false, link_referenc end { - original: text, + original: original, link: link_content, link_reference: link_reference, object_sym => object.id @@ -306,14 +294,6 @@ def parent attr_accessor :reference_cache - def escape_with_placeholders(text, placeholder_data) - escaped = escape_html_entities(text) - - escaped.gsub(REFERENCE_PLACEHOLDER_PATTERN) do |match| - placeholder_data[Regexp.last_match(1).to_i] - end - end - def additional_object_attributes(object) {} end diff --git a/lib/banzai/filter/references/commit_range_reference_filter.rb b/lib/banzai/filter/references/commit_range_reference_filter.rb index 34ba4f7d4edb03..38506972b34dba 100644 --- a/lib/banzai/filter/references/commit_range_reference_filter.rb +++ b/lib/banzai/filter/references/commit_range_reference_filter.rb @@ -11,7 +11,9 @@ class CommitRangeReferenceFilter < AbstractReferenceFilter self.object_class = CommitRange def references_in(text, pattern = object_reference_pattern) - Gitlab::Utils::Gsub.gsub_with_limit(text, pattern, limit: Banzai::Filter::FILTER_ITEM_LIMIT) do |match_data| + replace_references_in_text_with_html( + Gitlab::Utils::Gsub.gsub_with_limit(text, pattern, + limit: Banzai::Filter::FILTER_ITEM_LIMIT)) do |match_data| yield match_data[0], match_data[:commit_range], match_data[:project], match_data[:namespace], match_data end diff --git a/lib/banzai/filter/references/commit_reference_filter.rb b/lib/banzai/filter/references/commit_reference_filter.rb index 27d1fb52a3bf56..685c40c280daaa 100644 --- a/lib/banzai/filter/references/commit_reference_filter.rb +++ b/lib/banzai/filter/references/commit_reference_filter.rb @@ -11,7 +11,8 @@ class CommitReferenceFilter < AbstractReferenceFilter self.object_class = Commit def references_in(text, pattern = object_reference_pattern) - Gitlab::Utils::Gsub.gsub_with_limit(text, pattern, limit: Banzai::Filter::FILTER_ITEM_LIMIT) do |match_data| + replace_references_in_text_with_html(Gitlab::Utils::Gsub.gsub_with_limit(text, pattern, + limit: Banzai::Filter::FILTER_ITEM_LIMIT)) do |match_data| yield match_data[0], match_data[:commit], match_data[:project], match_data[:namespace], match_data end diff --git a/lib/banzai/filter/references/design_reference_filter.rb b/lib/banzai/filter/references/design_reference_filter.rb index db736b7328a20b..edc84fe0a800cf 100644 --- a/lib/banzai/filter/references/design_reference_filter.rb +++ b/lib/banzai/filter/references/design_reference_filter.rb @@ -66,7 +66,7 @@ def url_for_object(design, project) Gitlab::Routing.url_helpers.designs_project_issue_path(project, design.issue, path_options) end - def data_attributes_for(_text, _project, design, **_kwargs) + def data_attributes_for(_original, _project, design, **_kwargs) super.merge(issue: design.issue_id) end diff --git a/lib/banzai/filter/references/external_issue_reference_filter.rb b/lib/banzai/filter/references/external_issue_reference_filter.rb index 6a4ee2e6a06174..4c36ee373807d7 100644 --- a/lib/banzai/filter/references/external_issue_reference_filter.rb +++ b/lib/banzai/filter/references/external_issue_reference_filter.rb @@ -24,25 +24,30 @@ def self.default_issues_tracker?(project) # Public: Find `JIRA-123` issue references in text # - # references_in(text, pattern) do |match, issue| + # references_in(text, pattern) do |match_text, issue| # "##{issue}" # end # # text - String text to search. # - # Yields the String match and the String issue reference. + # Yields the String text match and the String issue reference. # - # Returns a String replaced with the return of the block. + # Returns a HTML String replaced with the return of the block. + # + # See ReferenceFilter#references_in for a detailed discussion. def references_in(text, pattern = object_reference_pattern) - case pattern - when Regexp - Gitlab::Utils::Gsub.gsub_with_limit(text, pattern, limit: Banzai::Filter::FILTER_ITEM_LIMIT) do |match_data| - yield match_data[0], match_data[:issue] - end - when Gitlab::UntrustedRegexp - pattern.replace_gsub(text, limit: Banzai::Filter::FILTER_ITEM_LIMIT) do |match| - yield match, match[:issue] + enumerator = + case pattern + when Regexp + Gitlab::Utils::Gsub.gsub_with_limit(text, pattern, limit: Banzai::Filter::FILTER_ITEM_LIMIT) + when Gitlab::UntrustedRegexp + pattern.replace_gsub(text, limit: Banzai::Filter::FILTER_ITEM_LIMIT) + else + return end + + replace_references_in_text_with_html(enumerator) do |match_data| + yield match_data[0], match_data[:issue] end end @@ -59,15 +64,17 @@ def call # issue's details page. # # text - String text to replace references in. - # link_content - Original content of the link being replaced. + # link_content_html - Original HTML content of the link being replaced. # - # Returns a String with `JIRA-123` references replaced with links. All + # Returns a HTML String with `JIRA-123` references replaced with links. All # links have `gfm` and `gfm-issue` class names attached for styling. - def object_link_filter(text, pattern, link_content: nil, link_reference: false) - references_in(text) do |match, id| + # + # Returns nil if no replacements are made. + def object_link_filter(text, pattern, link_content_html: nil, link_reference: false) + references_in(text) do |match_text, id| klass = reference_class(:issue) data = data_attribute(project: project.id, external_issue: id) - content = link_content || match + content = link_content_html || CGI.escapeHTML(match_text) write_opening_tag("a", { "title" => issue_title, diff --git a/lib/banzai/filter/references/issue_reference_filter.rb b/lib/banzai/filter/references/issue_reference_filter.rb index 3222f4eea9181b..bb7b48cd79e8ce 100644 --- a/lib/banzai/filter/references/issue_reference_filter.rb +++ b/lib/banzai/filter/references/issue_reference_filter.rb @@ -38,7 +38,7 @@ def reference_class(object_sym, tooltip: false) super end - def data_attributes_for(text, parent, object, **data) + def data_attributes_for(original, parent, object, **data) additional_attributes = { iid: object.iid, namespace_path: parent.full_path } if parent.is_a?(Namespaces::ProjectNamespace) || parent.is_a?(Project) additional_attributes[:project_path] = parent.full_path diff --git a/lib/banzai/filter/references/label_reference_filter.rb b/lib/banzai/filter/references/label_reference_filter.rb index 10399f05631430..07ad15247528d0 100644 --- a/lib/banzai/filter/references/label_reference_filter.rb +++ b/lib/banzai/filter/references/label_reference_filter.rb @@ -70,23 +70,10 @@ def record_identifier(record) end def references_in(text, pattern = Label.reference_pattern) - labels = {} - - unescaped_html = unescape_html_entities(text).gsub(pattern).with_index do |match, index| - ident = identifier($~) - label = yield match, ident, $~[:project], $~[:namespace], $~ - - if label != match - labels[index] = label - "#{REFERENCE_PLACEHOLDER}#{index}" - else - match - end + replace_references_in_text_with_html(text.gsub(pattern)) do |match_data| + ident = identifier(match_data) + yield match_data[0], ident, match_data[:project], match_data[:namespace], match_data end - - return text if labels.empty? - - escape_with_placeholders(unescaped_html, labels) end def find_labels(parent, absolute_path: false) diff --git a/lib/banzai/filter/references/merge_request_reference_filter.rb b/lib/banzai/filter/references/merge_request_reference_filter.rb index 271c34fe406748..9f11c5134ca895 100644 --- a/lib/banzai/filter/references/merge_request_reference_filter.rb +++ b/lib/banzai/filter/references/merge_request_reference_filter.rb @@ -52,7 +52,7 @@ def reference_class(object_sym, tooltip: false) super end - def data_attributes_for(text, parent, object, **data) + def data_attributes_for(original, parent, object, **data) super.merge(project_path: parent.full_path, iid: object.iid) end diff --git a/lib/banzai/filter/references/milestone_reference_filter.rb b/lib/banzai/filter/references/milestone_reference_filter.rb index fa1fd9baaf916a..df1d4595ce5530 100644 --- a/lib/banzai/filter/references/milestone_reference_filter.rb +++ b/lib/banzai/filter/references/milestone_reference_filter.rb @@ -58,7 +58,11 @@ def parse_symbol(symbol, match_data) # holds the iid { milestone_iid: symbol.to_i, milestone_name: nil, absolute_path: absolute_path } else - { milestone_iid: match_data[:milestone_iid]&.to_i, milestone_name: match_data[:milestone_name]&.tr('"', ''), absolute_path: absolute_path } + { + milestone_iid: match_data[:milestone_iid]&.to_i, + milestone_name: match_data[:milestone_name]&.tr('"', ''), + absolute_path: absolute_path + } end end @@ -87,23 +91,10 @@ def references_in(text, pattern = Milestone.reference_pattern) # default implementation. return super(text, pattern) if pattern != Milestone.reference_pattern - milestones = {} - - unescaped_html = unescape_html_entities(text).gsub(pattern).with_index do |match, index| - ident = identifier($~) - milestone = yield match, ident, $~[:project], $~[:namespace], $~ - - if milestone != match - milestones[index] = milestone - "#{REFERENCE_PLACEHOLDER}#{index}" - else - match - end + replace_references_in_text_with_html(text.gsub(pattern)) do |match_data| + ident = identifier(match_data) + yield match_data[0], ident, match_data[:project], match_data[:namespace], match_data end - - return text if milestones.empty? - - escape_with_placeholders(unescaped_html, milestones) end def find_milestones(parent, find_by_iid = false, absolute_path: false) @@ -159,7 +150,7 @@ def requires_unescaping? true end - def data_attributes_for(text, parent, object, link_content: false, link_reference: false) + def data_attributes_for(original, parent, object, link_content: false, link_reference: false) object_parent = object.resource_parent return super unless object_parent.is_a?(Group) diff --git a/lib/banzai/filter/references/project_reference_filter.rb b/lib/banzai/filter/references/project_reference_filter.rb index 52688a15245b75..3805ac20ee0a88 100644 --- a/lib/banzai/filter/references/project_reference_filter.rb +++ b/lib/banzai/filter/references/project_reference_filter.rb @@ -10,17 +10,20 @@ class ProjectReferenceFilter < ReferenceFilter # Public: Find `namespace/project>` project references in text # - # references_in(text) do |match, project| + # references_in(text) do |match_text, project| # "#{project}>" # end # # text - String text to search. # - # Yields the String match, and the String project name. + # Yields the String text match, and the String project name. # - # Returns a String replaced with the return of the block. + # Returns a HTML String with replacements made, or nil if no replacements were made. + # + # See ReferenceFilter#references_in for a detailed discussion. def references_in(text, pattern = object_reference_pattern) - Gitlab::Utils::Gsub.gsub_with_limit(text, pattern, limit: Banzai::Filter::FILTER_ITEM_LIMIT) do |match_data| + replace_references_in_text_with_html(Gitlab::Utils::Gsub.gsub_with_limit(text, pattern, + limit: Banzai::Filter::FILTER_ITEM_LIMIT)) do |match_data| yield match_data[0], "#{match_data[:namespace]}/#{match_data[:project]}" end end @@ -35,17 +38,15 @@ def object_reference_pattern # project page. # # text - String text to replace references in. - # link_content - Original content of the link being replaced. + # link_content_html - Original HTML content of the link being replaced. # - # Returns a String with `namespace/project>` references replaced with links. All links - # have `gfm` and `gfm-project` class names attached for styling. - def object_link_filter(text, pattern, link_content: nil, link_reference: false) - references_in(text) do |match, project_path| - cached_call(:banzai_url_for_object, match, path: [Project, project_path.downcase]) do + # Returns a String HTML with `namespace/project>` references replaced with links, or nil + # if no replacements were made. All links have `gfm` and `gfm-project` class names attached for styling. + def object_link_filter(text, pattern, link_content_html: nil, link_reference: false) + references_in(text) do |match_text, project_path| + cached_call(:banzai_url_for_object, match_text, path: [Project, project_path.downcase]) do if project = projects_hash[project_path.downcase] - link_to_project(project, link_content: link_content) || match - else - match + link_to_project(project, link_content_html: link_content_html) end end end @@ -85,10 +86,10 @@ def link_class reference_class(:project) end - def link_to_project(project, link_content: nil) + def link_to_project(project, link_content_html: nil) url = urls.project_url(project, only_path: context[:only_path]) data = data_attribute(project: project.id) - content = link_content || project.to_reference + content = link_content_html || CGI.escapeHTML(project.to_reference) write_opening_tag("a", { "href" => url, diff --git a/lib/banzai/filter/references/reference_cache.rb b/lib/banzai/filter/references/reference_cache.rb index 97bb24376c00fa..82525a978f1ca2 100644 --- a/lib/banzai/filter/references/reference_cache.rb +++ b/lib/banzai/filter/references/reference_cache.rb @@ -203,16 +203,12 @@ def refs_cache end def prepare_doc_for_scan - filter.requires_unescaping? ? unescape_html_entities(html_content) : html_content + filter.requires_unescaping? ? CGI.unescapeHTML(html_content) : html_content end def html_content result[:rendered_html] ||= filter.doc.to_html end - - def unescape_html_entities(text) - CGI.unescapeHTML(text.to_s) - end end end end diff --git a/lib/banzai/filter/references/reference_filter.rb b/lib/banzai/filter/references/reference_filter.rb index 07916f69e60f01..41d2220498df59 100644 --- a/lib/banzai/filter/references/reference_filter.rb +++ b/lib/banzai/filter/references/reference_filter.rb @@ -20,6 +20,13 @@ class ReferenceFilter < HTML::Pipeline::Filter REFERENCE_TYPE_ATTRIBUTE = :reference_type REFERENCE_TYPE_DATA_ATTRIBUTE_NAME = "data-#{REFERENCE_TYPE_ATTRIBUTE.to_s.dasherize}".freeze + # REFERENCE_PLACEHOLDER is used for replacement HTML to be inserted during + # reference matching. The random string helps ensure it's pretty close to unique. + # Since it's a transitory value (it never leaves the filter) we can initialize once, + # and it doesn't matter if it changes on a restart. + REFERENCE_PLACEHOLDER = "_reference_#{SecureRandom.hex(16)}_".freeze + REFERENCE_PLACEHOLDER_PATTERN = %r{#{REFERENCE_PLACEHOLDER}(\d+)} + class << self # Implement in child class # Example: self.reference_type = :merge_request @@ -50,15 +57,14 @@ def call nodes.each_with_index do |node, index| if text_node?(node) - replace_text_when_pattern_matches(node, index, object_reference_pattern) do |content| + replace_node_when_text_matches(node, index, object_reference_pattern) do |content| object_link_filter(content, object_reference_pattern) end elsif element_node?(node) - yield_valid_link(node) do |link, inner_html| + yield_valid_link(node) do |link, _text, inner_html| if ref_pattern_start.match?(link) - replace_link_node_with_href(node, index, link) do - object_link_filter(link, ref_pattern_start, link_content: inner_html) - end + html = object_link_filter(link, ref_pattern_start, link_content_html: inner_html) + replace_node_with_html(node, index, html) if html end end end @@ -67,23 +73,107 @@ def call doc end - # Public: Find references in text (like `!123` for merge requests) + # Public: Find references in text (like `!123` for merge requests). + # + # Simplified usage (see AbstractReferenceFilter#object_link_filter for a full example): # - # references_in(text) do |match, id, project_ref, matches| - # object = find_object(project_ref, id) - # "#{object.to_reference}" + # references_in(text) do |match_text, id, project_ref, namespace_ref, matches| + # object = find_object(namespace_ref, project_ref, id) + # if object + # write_opening_tag("a", { + # "href" => "...", + # # ... other attributes ... + # }) << CGI.escapeHTML(object.to_reference) << "" + # else + # CGI.escapeHTML(match_text) + # end # end # - # text - String text to search. + # text - String text to make replacements in. This is the content of a DOM text + # node, and *cannot* be returned without modification. + # + # Yields each String text match as the first argument; the remaining arguments + # depend on the particular subclass of ReferenceFilter being used. The block + # must return HTML, and not text. + # + # Returns a HTML String replaced with replacements made, or nil if no replacements + # were made. + # + # Notes: + # + # * The input is text -- we expect to match e.g. &1 or %"Git 2.5" without having to + # deal with HTML entities, as our object reference patterns are not compatible + # with entities. # - # Yields the String match, the Integer referenced object ID, an optional String - # of the external project reference, and all of the matchdata. + # * Likewise, any strings yielded to the block are text, but the block must return + # HTML, whether it's a successful resolution or not. Any text input used in the + # block must be escaped for return. # - # Returns a String replaced with the return of the block. + # * The return value is HTML, as the whole point is to create links (and possibly other + # elements). Care must be taken that any input text is escaped before reaching + # the output. This must be respected in all subclasses of ReferenceFilter. + # + # See AbstractReferenceFilter#references_in for a reference implementation that safely + # handles user input. def references_in(text, pattern = object_reference_pattern) raise NotImplementedError, "#{self.class} must implement method: #{__callee__}" end + # A helper to make it easier to implement #references_in correctly. + # + # Pass in the replacement Enumerator of choice, on *plain text* input. + # This will often look like this: + # + # Gitlab::Utils::Gsub.gsub_with_limit(text, pattern, limit: Banzai::Filter::FILTER_ITEM_LIMIT) + # + # Or the simpler: + # + # text.gsub(pattern) + # + # In both cases, the subject we're performing replacements on is text, not HTML. + # + # This function will yield MatchData from the underlying enumerator -- $~ if it's available, + # otherwise the yield from the enumerator itself is asserted to be MatchData or RE2::MatchData. + # The given block should return HTML (!) to substitute in place, or nil if no replacement + # is to be made. + # + # The function returns HTML, safely substituting the block results in, while escaping all + # other text. The supplied block MUST escape any input text returned within it, if any. + # + # If no replacements were made --- that is, the enumerator didn't yield anything, or the block + # given to this function only returned nil --- nil is returned. + def replace_references_in_text_with_html(enumerator) + replacements = {} + + # We operate on text, yield text, and substitute back in only text. + replaced_text = enumerator.with_index do |enumerator_yielded, index| + match_data = $~ || enumerator_yielded + unless match_data.is_a?(MatchData) || match_data.is_a?(RE2::MatchData) + raise ArgumentError, "enumerator didn't yield MatchData (is #{match_data.inspect}) and $~ is unavailable" + end + + replacement = yield match_data + if replacement + # The yield returns HTML to us, but we can't substitute it back in yet --- there + # remains unescaped, textual content in unmatched parts of the string which we + # need to escape without affecting the block yields. Instead, store the result, + # and substitute back into the text a placeholder we can replace after escaping. + replacements[index] = replacement + "#{REFERENCE_PLACEHOLDER}#{index}" + else + match_data.to_s + end + end + + return unless replacements.any? + + # Escape the replaced_text --- which doesn't change the placeholders --- and only then + # replace the placeholders with the yielded HTML. + CGI.escapeHTML(replaced_text).gsub(REFERENCE_PLACEHOLDER_PATTERN) do + replacements[Regexp.last_match(1).to_i] + end + end + # Iterates over all and text() nodes in a document. # # Nodes are skipped whenever their ancestor is one of the nodes returned @@ -184,50 +274,44 @@ def reference_class(type, tooltip: true) "#{gfm_klass} has-tooltip" end - # Yields the link's URL and inner HTML whenever the node is a valid tag. + # Yields the link's URL, inner text, and inner HTML whenever the node is a valid tag. + # + # We expect that the URL and inner *text* will be equal when the link was a result of + # an autolink in Markdown. For example: + # + # ```markdown + # + # ``` + # + # produces the HTML: + # + # ```html + # https://gitlab.com/x?y="z"&fox + # ``` + # + # The href DOM attribute, after percent-decoding, is 'https://gitlab.com/x?y="z"&fox'. + # The node's inner text is identical: 'https://gitlab.com/x?y="z"&fox'. + # The node's inner HTML is 'https://gitlab.com/x?y="z"&fox'. def yield_valid_link(node) - link = unescape_link(node.attr('href').to_s) - inner_html = node.inner_html - - return unless link.force_encoding('UTF-8').valid_encoding? - - yield link, inner_html - end - - def unescape_link(href) # We cannot use CGI.unescape here because it also converts `+` to spaces. - # We need to keep the `+` for expanded reference formats. - Addressable::URI.unescape(href) - end - - def unescape_html_entities(text) - CGI.unescapeHTML(text.to_s) - end - - def escape_html_entities(text) - CGI.escapeHTML(text.to_s) - end + # We need to keep the `+` for expanded reference formats; we want to only + # percent-decode here. + link = Addressable::URI.unescape(node.attr('href').to_s) - def replace_text_when_pattern_matches(node, index, pattern) - return if pattern.is_a?(Gitlab::UntrustedRegexp) && !pattern.match?(node.text) - return if pattern.is_a?(Regexp) && !(pattern =~ node.text) - - content = node.to_html - html = yield content + return unless link.force_encoding('UTF-8').valid_encoding? - replace_text_with_html(node, index, html) unless html == content + yield link, node.text, node.inner_html end - def replace_link_node_with_text(node, index) - html = yield + # Replaces a node with HTML obtained by yielding node.text, iff node.text matches the given pattern. + # Doesn't replace anything if the block returns nil. + def replace_node_when_text_matches(node, index, pattern) + node_text = node.text - replace_text_with_html(node, index, html) unless html == node.text - end + return unless pattern.match?(node_text) - def replace_link_node_with_href(node, index, link) - html = yield - - replace_text_with_html(node, index, html) unless html == link + html = yield node_text + replace_node_with_html(node, index, html) if html end def text_node?(node) @@ -250,7 +334,7 @@ def object_sym @object_sym ||= object_name.to_sym end - def object_link_filter(text, pattern, link_content: nil, link_reference: false) + def object_link_filter(text, pattern, link_content_html: nil, link_reference: false) raise NotImplementedError, "#{self.class} must implement method: #{__callee__}" end @@ -261,11 +345,9 @@ def query ]} end - def replace_text_with_html(node, index, html) - replace_and_update_new_nodes(node, index, html) - end + def replace_node_with_html(node, index, html) + return if node.to_html == html - def replace_and_update_new_nodes(node, index, html) previous_node = node.previous next_node = node.next parent_node = node.parent diff --git a/lib/banzai/filter/references/user_reference_filter.rb b/lib/banzai/filter/references/user_reference_filter.rb index f6ea1713bb9f36..6e96fceccc0c13 100644 --- a/lib/banzai/filter/references/user_reference_filter.rb +++ b/lib/banzai/filter/references/user_reference_filter.rb @@ -14,18 +14,21 @@ class UserReferenceFilter < ReferenceFilter # Public: Find `@user` user references in text # - # references_in(text) do |match, username| + # references_in(text) do |match_text, username| # "@#{user}" # end # # text - String text to search. # - # Yields the String match, and the String user name. + # Yields the String match text, and the String user name. # - # Returns a String replaced with the return of the block. + # Returns a HTML String with replacements made, or nil if no replacements were made. + # + # See ReferenceFilter#references_in for a detailed discussion. def references_in(text, pattern = object_reference_pattern) - Gitlab::Utils::Gsub.gsub_with_limit(text, pattern, limit: Banzai::Filter::FILTER_ITEM_LIMIT) do |match_data| - yield match_data[0], match_data['user'] + replace_references_in_text_with_html(Gitlab::Utils::Gsub.gsub_with_limit(text, pattern, + limit: Banzai::Filter::FILTER_ITEM_LIMIT)) do |match_data| + yield match_data[0], match_data[:user] end end @@ -41,23 +44,23 @@ def call # user's profile page. # # text - String text to replace references in. - # link_content - Original content of the link being replaced. + # link_content_html - Original HTML content of the link being replaced. # - # Returns a String with `@user` references replaced with links. All links + # Returns a HTML String with `@user` references replaced with links. All links # have `gfm` and `gfm-project_member` class names attached for styling. - def object_link_filter(text, pattern, link_content: nil, link_reference: false) - references_in(text, pattern) do |match, username| + # + # Returns nil if no replacements were made. + def object_link_filter(text, pattern, link_content_html: nil, link_reference: false) + references_in(text, pattern) do |match_text, username| if Feature.disabled?(:disable_all_mention) && username == 'all' && !skip_project_check? - link_to_all(link_content: link_content) + link_to_all(link_content_html: link_content_html) else - cached_call(:banzai_url_for_object, match, path: [User, username.downcase]) do + cached_call(:banzai_url_for_object, match_text, path: [User, username.downcase]) do # order is important: per-organization usernames should be checked before global namespace if org_user_detail = org_user_details[username.downcase] link_to_org_user_detail(org_user_detail) elsif namespace = namespaces[username.downcase] - link_to_namespace(namespace, link_content: link_content) || match - else - match + link_to_namespace(namespace, link_content_html: link_content_html) end end end @@ -111,56 +114,56 @@ def link_class [reference_class(:project_member, tooltip: false), "js-user-link"].join(" ") end - def link_to_all(link_content: nil) + def link_to_all(link_content_html: nil) author = context[:author] if author && !team_member?(author) - link_content + link_content_html else - parent_url(link_content, author) + parent_url(link_content_html, author) end end - def link_to_namespace(namespace, link_content: nil) + def link_to_namespace(namespace, link_content_html: nil) if namespace.is_a?(Group) - link_to_group(namespace.full_path, namespace, link_content: link_content) + link_to_group(namespace.full_path, namespace, link_content_html: link_content_html) else - link_to_user(namespace.path, namespace, link_content: link_content) + link_to_user(namespace.path, namespace, link_content_html: link_content_html) end end - def link_to_group(group, namespace, link_content: nil) + def link_to_group(group, namespace, link_content_html: nil) url = urls.group_url(group, only_path: context[:only_path]) data = data_attribute(group: namespace.id) - content = link_content || (Group.reference_prefix + group) + html_content = link_content_html || CGI.escapeHTML(Group.reference_prefix + group) - link_tag(url, data, content, namespace.full_name) + link_tag(url, data, html_content, namespace.full_name) end - def link_to_user(user, namespace, link_content: nil) + def link_to_user(user, namespace, link_content_html: nil) url = urls.user_url(user, only_path: context[:only_path]) data = data_attribute(user: namespace.owner_id) - content = link_content || (User.reference_prefix + user) + html_content = link_content_html || CGI.escapeHTML(User.reference_prefix + user) - link_tag(url, data, content, namespace.owner_name) + link_tag(url, data, html_content, namespace.owner_name) end - def link_to_org_user_detail(org_user_detail, link_content: nil) + def link_to_org_user_detail(org_user_detail, link_content_html: nil) user = org_user_detail.user url = urls.user_url(user, only_path: context[:only_path]) data = data_attribute(user: user.id) - content = link_content || org_user_detail.to_reference + html_content = link_content_html || CGI.escapeHTML(org_user_detail.to_reference) - link_tag(url, data, content, org_user_detail.username) + link_tag(url, data, html_content, org_user_detail.username) end - def link_tag(url, data, link_content, title) + def link_tag(url, data, link_content_html, title) write_opening_tag("a", { "href" => url, "title" => title, "class" => link_class, **data - }) << link_content << "" + }) << link_content_html << "" end def organization @@ -182,7 +185,7 @@ def team_member?(user) parent.member?(user) end - def parent_url(link_content, author) + def parent_url(link_content_html, author) if parent_group? url = urls.group_url(parent, only_path: context[:only_path]) data = data_attribute(group: group.id, author: author.try(:id)) @@ -191,8 +194,8 @@ def parent_url(link_content, author) data = data_attribute(project: project.id, author: author.try(:id)) end - content = link_content || (User.reference_prefix + 'all') - link_tag(url, data, content, 'All Project and Group Members') + html_content = link_content_html || CGI.escapeHTML(User.reference_prefix + 'all') + link_tag(url, data, html_content, 'All Project and Group Members') end end end diff --git a/lib/banzai/reference_redactor.rb b/lib/banzai/reference_redactor.rb index 48000185c9bf69..8a8b7714533d81 100644 --- a/lib/banzai/reference_redactor.rb +++ b/lib/banzai/reference_redactor.rb @@ -65,7 +65,6 @@ def redact_document_nodes(all_document_nodes) # def redacted_node_content(node) original_content = node.attr('data-original') - original_content = CGI.escape_html(original_content) if original_content # Build the raw tag just with a link as href and content if # it's originally a link pattern. We shouldn't return a plain text href. diff --git a/lib/gitlab/untrusted_regexp.rb b/lib/gitlab/untrusted_regexp.rb index 07c7ab31050a78..006681654f315b 100644 --- a/lib/gitlab/untrusted_regexp.rb +++ b/lib/gitlab/untrusted_regexp.rb @@ -40,6 +40,8 @@ def replace_all(text, rewrite) # There is no built-in replace with block support (like `gsub`). We can accomplish # the same thing by parsing and rebuilding the string with the substitutions. def replace_gsub(text, limit: 0) + return enum_for(:replace_gsub, text, limit:) unless block_given? + new_text = +'' remainder = text count = 0 diff --git a/lib/gitlab/utils/gsub.rb b/lib/gitlab/utils/gsub.rb index 29fcf31968d0dc..4c59b39bac3ad8 100644 --- a/lib/gitlab/utils/gsub.rb +++ b/lib/gitlab/utils/gsub.rb @@ -9,6 +9,8 @@ module Gsub # allows us to break out of the replacement loop when the limit is reached. # This is the same algorithm used for Gitlab::UntrustedRegexp.replace_gsub def gsub_with_limit(text, pattern, limit:) + return enum_for(:gsub_with_limit, text, pattern, limit:) unless block_given? + new_text = +'' remainder = text count = 0 diff --git a/spec/lib/banzai/filter/references/abstract_reference_filter_spec.rb b/spec/lib/banzai/filter/references/abstract_reference_filter_spec.rb index a2e0d92328d729..cbbb34c93fa5f4 100644 --- a/spec/lib/banzai/filter/references/abstract_reference_filter_spec.rb +++ b/spec/lib/banzai/filter/references/abstract_reference_filter_spec.rb @@ -29,7 +29,7 @@ it 'uses gsub_with_limit' do allow(described_class).to receive(:object_class).and_return(Issue) - expect(Gitlab::Utils::Gsub).to receive(:gsub_with_limit).with(anything, anything, limit: Banzai::Filter::FILTER_ITEM_LIMIT).and_call_original + expect(Gitlab::Utils::Gsub).to receive(:gsub_with_limit).with(anything, anything, limit: Banzai::Filter::FILTER_ITEM_LIMIT).twice.and_call_original filter_instance.references_in('text') end @@ -52,4 +52,33 @@ it_behaves_like 'a filter timeout' do let(:text) { 'text' } end + + describe '#references_in' do + let(:reference) { %(~"my #{range.to_reference}" } end + + it_behaves_like 'ReferenceFilter#references_in' do + let(:reference) { range.to_reference } + let(:filter_instance) { described_class.new(nil, { project: nil }) } + end end diff --git a/spec/lib/banzai/filter/references/commit_reference_filter_spec.rb b/spec/lib/banzai/filter/references/commit_reference_filter_spec.rb index c08da88f51efe3..031ce55baf398f 100644 --- a/spec/lib/banzai/filter/references/commit_reference_filter_spec.rb +++ b/spec/lib/banzai/filter/references/commit_reference_filter_spec.rb @@ -345,4 +345,9 @@ let(:text) { "#{commit.id} #{commit.id} #{commit.id}" } let(:ends_with) { " #{commit.id}" } end + + it_behaves_like 'ReferenceFilter#references_in' do + let(:reference) { commit.id } + let(:filter_instance) { described_class.new(nil, { project: }) } + end end diff --git a/spec/lib/banzai/filter/references/external_issue_reference_filter_spec.rb b/spec/lib/banzai/filter/references/external_issue_reference_filter_spec.rb index 2238d1e5408e8a..36b55060ce5b2e 100644 --- a/spec/lib/banzai/filter/references/external_issue_reference_filter_spec.rb +++ b/spec/lib/banzai/filter/references/external_issue_reference_filter_spec.rb @@ -373,4 +373,30 @@ def filter(markdown, context = {}) expect { reference_filter(multiple_references).to_html }.not_to exceed_query_limit(control) end end + + context "pattern is Regexp" do + it_behaves_like 'ReferenceFilter#references_in' do + let_it_be(:integration) { create(:redmine_integration, project: project) } + + let(:issue) { ExternalIssue.new("T-123", project) } + let(:reference) { issue.to_reference } + + let(:filter_instance) { described_class.new(nil, { project: }) } + end + end + + context "pattern is Gitlab::UntrustedRegexp" do + it_behaves_like 'ReferenceFilter#references_in' do + let_it_be(:integration) { create(:redmine_integration, project: project) } + + let(:issue) { ExternalIssue.new("T-123", project) } + let(:reference) { issue.to_reference } + + before do + allow_any_instance_of(described_class).to receive(:object_reference_pattern).and_return(Gitlab::UntrustedRegexp.new("\\b[A-Z][A-Z0-9_]*-(?\\d{1,20})")) + end + + let(:filter_instance) { described_class.new(nil, { project: }) } + end + end end diff --git a/spec/lib/banzai/filter/references/label_reference_filter_spec.rb b/spec/lib/banzai/filter/references/label_reference_filter_spec.rb index 4568522641fc9c..55b1a1da07c226 100644 --- a/spec/lib/banzai/filter/references/label_reference_filter_spec.rb +++ b/spec/lib/banzai/filter/references/label_reference_filter_spec.rb @@ -923,4 +923,22 @@ end.not_to exceed_all_query_limit(max_count) end end + + it_behaves_like 'a reference which does not unescape its content in data-original' do + let(:context) { { project: project } } + let(:label_title) { "x<script>alert('xss');</script>" } + let(:resource) { create(:label, title: label_title, project: project) } + let(:reference) { %(#{resource.class.reference_prefix}"#{label_title}") } + + # This is probably bad (why are we unescaping entities in the input title like this?), + # but it is the case today, and we need to be safe in spite of this. + let(:expected_resource_title) { "x" } + + let(:expected_href) { urls.project_issues_url(project, label_name: expected_resource_title) } + let(:expected_replacement_text) { expected_resource_title } + end + + it_behaves_like 'ReferenceFilter#references_in' do + let(:filter_instance) { described_class.new(nil, { project: }) } + end end diff --git a/spec/lib/banzai/filter/references/milestone_reference_filter_spec.rb b/spec/lib/banzai/filter/references/milestone_reference_filter_spec.rb index 37269a0c675a3e..7675eacf0cd6f1 100644 --- a/spec/lib/banzai/filter/references/milestone_reference_filter_spec.rb +++ b/spec/lib/banzai/filter/references/milestone_reference_filter_spec.rb @@ -340,7 +340,7 @@ result = reference_filter("See #{absolute_reference}") expect(result.css('a').first.attr('href')).to eq(urls.milestone_url(milestone)) - expect(result.css('a').first.attr('data-original')).to eq(absolute_reference) + expect(result.css('a').first.attr('data-original')).to eq_html(absolute_reference) expect(result.content).to eq "See %#{milestone.title}" end end @@ -627,4 +627,24 @@ end.not_to exceed_all_query_limit(control_count) end end + + it_behaves_like 'a reference which does not unescape its content in data-original' do + let(:context) { { project: project } } + let(:milestone_title) { "x<script>alert('xss');</script>" } + let(:resource) { create(:milestone, title: milestone_title, project: project) } + let(:reference) { %(#{resource.class.reference_prefix}"#{milestone_title}") } + + # This is probably bad (why are we unescaping entities in the input title like this?), + # but it is the case today, and we need to be safe in spite of this. + let(:expected_resource_title) { "x" } + + let(:expected_href) { urls.milestone_url(resource) } + let(:expected_replacement_text) { %(#{resource.class.reference_prefix}#{expected_resource_title}) } + end + + it_behaves_like 'ReferenceFilter#references_in' do + let(:milestone) { create(:milestone, project: project) } + let(:reference) { milestone.to_reference } + let(:filter_instance) { described_class.new(nil, { project: }) } + end end diff --git a/spec/lib/banzai/filter/references/project_reference_filter_spec.rb b/spec/lib/banzai/filter/references/project_reference_filter_spec.rb index 1539af7d32d5bc..5814689d970eea 100644 --- a/spec/lib/banzai/filter/references/project_reference_filter_spec.rb +++ b/spec/lib/banzai/filter/references/project_reference_filter_spec.rb @@ -133,4 +133,8 @@ def get_reference(project) let(:text) { "#{reference} #{reference} #{reference}" } let(:ends_with) { " #{CGI.escapeHTML(reference)}" } end + + it_behaves_like 'ReferenceFilter#references_in' do + let(:filter_instance) { described_class.new(nil, { project: nil }) } + end end diff --git a/spec/lib/banzai/filter/references/reference_filter_spec.rb b/spec/lib/banzai/filter/references/reference_filter_spec.rb index 075a2e56851c7a..60eca79c64a1c5 100644 --- a/spec/lib/banzai/filter/references/reference_filter_spec.rb +++ b/spec/lib/banzai/filter/references/reference_filter_spec.rb @@ -81,13 +81,15 @@ RSpec.shared_examples 'replaces text' do |method_name, index| let(:args) { [filter.nodes[index], index, ref_pattern || href_link].compact } + def call_method(method_name, replacement) + filter.send(method_name, *args) { replacement } + end + context 'when content didnt change' do it 'does not replace link node with html' do - filter.send(method_name, *args) do - existing_content - end + call_method(method_name, existing_content) - expect(filter).not_to receive(:replace_text_with_html) + expect(filter).not_to receive(:replace_node_with_html) end end @@ -95,25 +97,19 @@ let(:html) { %(text Reference) } it 'replaces reference node' do - filter.send(method_name, *args) do - html - end + call_method(method_name, html) expect(document.css('a').length).to eq 1 end - it 'calls replace_and_update_new_nodes' do - expect(filter).to receive(:replace_and_update_new_nodes).with(filter.nodes[index], index, html) + it 'calls replace_node_with_html' do + expect(filter).to receive(:replace_node_with_html).with(filter.nodes[index], index, html) - filter.send(method_name, *args) do - html - end + call_method(method_name, html) end it 'stores filtered new nodes' do - filter.send(method_name, *args) do - html - end + call_method(method_name, html) expect(filter.instance_variable_get(:@new_nodes)).to eq({ index => [filter.each_node.to_a[index]] }) end @@ -151,7 +147,7 @@ end end - describe '#replace_text_when_pattern_matches' do + describe '#replace_node_when_text_matches' do include_context 'document nodes' let(:node) { Nokogiri::HTML.fragment('text @reference') } @@ -162,34 +158,15 @@ let(:nodes) { [node] } it 'skips node' do - expect { |b| filter.send(:replace_text_when_pattern_matches, filter.nodes[0], 0, ref_pattern, &b) }.not_to yield_control + expect { |b| filter.send(:replace_node_when_text_matches, filter.nodes[0], 0, ref_pattern, &b) }.not_to yield_control end end - it_behaves_like 'replaces document node', :replace_text_when_pattern_matches do + it_behaves_like 'replaces document node', :replace_node_when_text_matches do let(:existing_content) { node.to_html } end end - describe '#replace_link_node_with_text' do - include_context 'document nodes' - let(:node) { Nokogiri::HTML.fragment('end text') } - - it_behaves_like 'replaces document node', :replace_link_node_with_text do - let(:existing_content) { node.text } - end - end - - describe '#replace_link_node_with_href' do - include_context 'document nodes' - let(:node) { Nokogiri::HTML.fragment('end text') } - let(:href_link) { CGI.unescape(node.attr('href').to_s) } - - it_behaves_like 'replaces document node', :replace_link_node_with_href do - let(:existing_content) { href_link } - end - end - describe '#call_and_update_nodes' do include_context 'new nodes' let(:document) { Nokogiri::HTML.fragment('foo') } diff --git a/spec/lib/banzai/filter/references/user_reference_filter_spec.rb b/spec/lib/banzai/filter/references/user_reference_filter_spec.rb index b9ca92dc9b91d0..1d9af606723dbe 100644 --- a/spec/lib/banzai/filter/references/user_reference_filter_spec.rb +++ b/spec/lib/banzai/filter/references/user_reference_filter_spec.rb @@ -322,4 +322,8 @@ def get_reference(user) let(:text) { "#{reference} #{reference} #{reference}" } let(:ends_with) { " #{reference}" } end + + it_behaves_like 'ReferenceFilter#references_in' do + let(:filter_instance) { described_class.new(nil, { project: nil }) } + end end diff --git a/spec/lib/banzai/pipeline/full_pipeline_spec.rb b/spec/lib/banzai/pipeline/full_pipeline_spec.rb index 73225d8fd261ce..7b7360bcc5ffc9 100644 --- a/spec/lib/banzai/pipeline/full_pipeline_spec.rb +++ b/spec/lib/banzai/pipeline/full_pipeline_spec.rb @@ -43,9 +43,36 @@ markdown.delete!("\n") - result = described_class.to_html(markdown, project: project) + # The PlainMarkdownPipeline yields the following DOM (newlines and shape added for clarity): + # + #
+ # + # http://localhost/namespace1/project-1/-/issues/1 + # + # + # + # + #
+ # + result = described_class.to_html(markdown, project: project) expect(result).to include "" + + # The result at the end of GfmPipeline is as follows (again, pretty-printed): + # + #
+ # + # #1 + # + # + # + # + #
end it 'escapes the data-original attribute on a reference' do diff --git a/spec/lib/banzai/reference_redactor_spec.rb b/spec/lib/banzai/reference_redactor_spec.rb index b7d991553dcf4c..137e93f3b7aa84 100644 --- a/spec/lib/banzai/reference_redactor_spec.rb +++ b/spec/lib/banzai/reference_redactor_spec.rb @@ -35,14 +35,18 @@ end context 'when data-original attribute provided' do - let(:original_content) { '<script>alert(1);</script>' } - - it 'replaces redacted reference with original content' do - doc = Nokogiri::HTML.fragment("bar") - redactor.redact([doc]) - expect(doc.to_html).to eq(original_content) + RSpec.shared_examples 'substitutes original content back in as-is' do |original_content| + it 'replaces redacted reference with original content' do + doc = Nokogiri::HTML.fragment("bar") + doc.css('a')[0]["data-original"] = original_content + redactor.redact([doc]) + expect(doc.to_html).to eq(original_content) + end end + it_behaves_like 'substitutes original content back in as-is', '<script>alert(1);</script>' + it_behaves_like 'substitutes original content back in as-is', 'foo' + it 'does not replace redacted reference with original content if href is given' do html = "Marge" doc = Nokogiri::HTML.fragment(html) diff --git a/spec/mailers/emails/service_desk_spec.rb b/spec/mailers/emails/service_desk_spec.rb index e12a3b61001a77..c967e8657a4295 100644 --- a/spec/mailers/emails/service_desk_spec.rb +++ b/spec/mailers/emails/service_desk_spec.rb @@ -342,12 +342,11 @@ context 'with template' do context 'with all-user reference in a an external author comment' do let_it_be(:note) { create(:note_on_issue, noteable: issue, project: project, note: "Hey @all, just a ping", author: Users::Internal.support_bot) } + let(:expected_template_html) { 'Hey @all, just a ping' } let(:template_content) { 'some text %{ NOTE_TEXT }' } context 'when `disable_all_mention` is disabled' do - let(:expected_template_html) { 'Hey , just a ping' } - before do stub_feature_flags(disable_all_mention: false) end @@ -356,8 +355,6 @@ end context 'when `disable_all_mention` is enabled' do - let(:expected_template_html) { 'Hey @all, just a ping' } - before do stub_feature_flags(disable_all_mention: true) end diff --git a/spec/requests/projects/incident_management/timeline_events_spec.rb b/spec/requests/projects/incident_management/timeline_events_spec.rb index 8b866dcea5dda7..587eb558d7959c 100644 --- a/spec/requests/projects/incident_management/timeline_events_spec.rb +++ b/spec/requests/projects/incident_management/timeline_events_spec.rb @@ -38,8 +38,9 @@ params: { text: timeline_text } expect(response).to have_gitlab_http_status(:ok) + expect(json_response["body"]).to eq_html(expected_body) expect(json_response).to eq({ - body: expected_body, + body: json_response["body"], references: { commands: '', suggestions: [], diff --git a/spec/support/matchers/eq_html.rb b/spec/support/matchers/eq_html.rb new file mode 100644 index 00000000000000..5bdaa05d43b25a --- /dev/null +++ b/spec/support/matchers/eq_html.rb @@ -0,0 +1,82 @@ +# frozen_string_literal: true + +# Assert that the two values are the same in HTML context, modulo escaping operations that +# do not change the meaning of the value --- i.e. do not change how a web browser or HTML parser +# would treat them. +# +# This means, e.g. '"' and '"' are the same when encountered outside an HTML tag, +# but '<b>' and '' are not. +# +# Likewise, '' and +# "" are considered the same. +# +# This matcher helps tests be less brittle or flakey in the face of changing attribute orders, +# and less repetitive in the face of comparing essentially unchanged values that nonetheless +# are represented differently. +# +# Note that it uses a real HTML parser internally so as not to introduce security issues +# through accidental de-escaping or any of the other myriad ways HTML can be messed with. +RSpec::Matchers.define :eq_html do |expected| + match do |actual| + raise ArgumentError, "expected should be a String, not #{expected.class}" unless expected.is_a?(String) + raise ArgumentError, "actual should be a String, not #{actual.class}" unless actual.is_a?(String) + + EqHtmlMatcher.normalize_html(actual) == EqHtmlMatcher.normalize_html(expected) + end + + failure_message do |actual| + "Expected that\n\n #{actual}\n\nwould normalize to the same HTML as\n\n #{expected}\n\nbut it didn't.\n\n " \ + "#{EqHtmlMatcher.normalize_html(actual).inspect}\n!=\n #{EqHtmlMatcher.normalize_html(expected).inspect}" + end + + failure_message_when_negated do |actual| + "Expected that\n\n #{actual}\n\nwould NOT normalize to the same HTML as\n\n #{expected}\n\nbut it did!\n\n " \ + "#{EqHtmlMatcher.normalize_html(actual).inspect}\n==\n #{EqHtmlMatcher.normalize_html(expected).inspect}" + end +end + +# Assert the right-hand value is contained in the left-hand value after normalising HTML on both sides per eq_html. +# See eq_html's docstring for details on what this means. +# +# Note this implies you can only check for inclusion of bare text (outside tags) and whole tags; a +# partial tag will normalise to text ("" +# includes the HTML "node
inside' } - let(:reference_with_element) { %(#{inner_html}) } - - it 'does not escape inner html' do - doc = reference_filter(reference_with_element, try(:context) || {}) - - expect(doc.children.first.children.first.inner_html).to eq(inner_html) - end -end - -# Requires a reference, subject and subject_name: -# subject { create(:user) } -# let(:reference) { subject.to_reference } -# let(:subject_name) { 'user' } -RSpec.shared_examples 'user reference or project reference' do - shared_examples 'it contains a data- attribute' do - it 'includes a data- attribute' do - doc = reference_filter("Hey #{reference}") - link = doc.css('a').first - - expect(link).to have_attribute("data-#{subject_name}") - expect(link.attr("data-#{subject_name}")).to eq subject.id.to_s - end - end - - context 'when mentioning a resource' do - it_behaves_like 'a reference containing an element node' - it_behaves_like 'it contains a data- attribute' - - it "links to a resource" do - doc = reference_filter("Hey #{reference}") - expect(doc.css('a').first.attr('href')).to eq urls.send("#{subject_name}_url", subject) - end - - it 'links to a resource with a period' do - subject = create(subject_name.to_sym, name: 'alphA.Beta') - - doc = reference_filter("Hey #{get_reference(subject)}") - expect(doc.css('a').length).to eq 1 - end - - it 'links to a resource with an underscore' do - subject = create(subject_name.to_sym, name: 'ping_pong_king') - - doc = reference_filter("Hey #{get_reference(subject)}") - expect(doc.css('a').length).to eq 1 - end - - it 'links to a resource with different case-sensitivity' do - subject = create(subject_name.to_sym, name: 'RescueRanger') - reference = get_reference(subject) - - doc = reference_filter("Hey #{reference.upcase}") - expect(doc.css('a').length).to eq 1 - expect(doc.css('a').text).to eq(reference) - end - end - - it 'supports an :only_path context' do - doc = reference_filter("Hey #{reference}", only_path: true) - link = doc.css('a').first.attr('href') - - expect(link).not_to match %r{https?://} - expect(link).to eq urls.send "#{subject_name}_path", subject - end - - describe 'referencing a resource in a link href' do - let(:reference) { %(Some text) } - - it_behaves_like 'it contains a data- attribute' - - it 'links to the resource' do - doc = reference_filter("Hey #{reference}") - expect(doc.css('a').first.attr('href')).to eq urls.send "#{subject_name}_url", subject - end - - it 'links with adjacent text' do - doc = reference_filter("Mention me (#{reference}.)") - expect(doc.to_html).to match(%r{\(Some text\.\)}) - end - end -end diff --git a/spec/support/shared_examples/banzai/filters/emoji_shared_examples.rb b/spec/support/shared_examples/lib/banzai/filters/emoji_shared_examples.rb similarity index 100% rename from spec/support/shared_examples/banzai/filters/emoji_shared_examples.rb rename to spec/support/shared_examples/lib/banzai/filters/emoji_shared_examples.rb diff --git a/spec/support/shared_examples/banzai/filters/filter_timeout_shared_examples.rb b/spec/support/shared_examples/lib/banzai/filters/filter_timeout_shared_examples.rb similarity index 100% rename from spec/support/shared_examples/banzai/filters/filter_timeout_shared_examples.rb rename to spec/support/shared_examples/lib/banzai/filters/filter_timeout_shared_examples.rb diff --git a/spec/support/shared_examples/lib/banzai/filters/reference_filter_shared_examples.rb b/spec/support/shared_examples/lib/banzai/filters/reference_filter_shared_examples.rb index ac96034f7e021f..d6163b19dac9ea 100644 --- a/spec/support/shared_examples/lib/banzai/filters/reference_filter_shared_examples.rb +++ b/spec/support/shared_examples/lib/banzai/filters/reference_filter_shared_examples.rb @@ -1,5 +1,93 @@ # frozen_string_literal: true +# Specs for reference links containing HTML. +# +# Requires a reference: +# let(:reference) { '#42' } +RSpec.shared_examples 'a reference containing an element node' do + let(:inner_html) { 'element node inside' } + let(:reference_with_element) { %(#{inner_html}) } + + it 'does not escape inner html' do + doc = reference_filter(reference_with_element, try(:context) || {}) + + expect(doc.children.first.children.first.inner_html).to eq(inner_html) + end +end + +# Requires a reference, subject and subject_name: +# subject { create(:user) } +# let(:reference) { subject.to_reference } +# let(:subject_name) { 'user' } +RSpec.shared_examples 'user reference or project reference' do + shared_examples 'it contains a data- attribute' do + it 'includes a data- attribute' do + doc = reference_filter("Hey #{reference}") + link = doc.css('a').first + + expect(link).to have_attribute("data-#{subject_name}") + expect(link.attr("data-#{subject_name}")).to eq subject.id.to_s + end + end + + context 'when mentioning a resource' do + it_behaves_like 'a reference containing an element node' + it_behaves_like 'it contains a data- attribute' + + it "links to a resource" do + doc = reference_filter("Hey #{reference}") + expect(doc.css('a').first.attr('href')).to eq urls.send(:"#{subject_name}_url", subject) + end + + it 'links to a resource with a period' do + subject = create(subject_name.to_sym, name: 'alphA.Beta') + + doc = reference_filter("Hey #{get_reference(subject)}") + expect(doc.css('a').length).to eq 1 + end + + it 'links to a resource with an underscore' do + subject = create(subject_name.to_sym, name: 'ping_pong_king') + + doc = reference_filter("Hey #{get_reference(subject)}") + expect(doc.css('a').length).to eq 1 + end + + it 'links to a resource with different case-sensitivity' do + subject = create(subject_name.to_sym, name: 'RescueRanger') + reference = get_reference(subject) + + doc = reference_filter("Hey #{reference.upcase}") + expect(doc.css('a').length).to eq 1 + expect(doc.css('a').text).to eq(reference) + end + end + + it 'supports an :only_path context' do + doc = reference_filter("Hey #{reference}", only_path: true) + link = doc.css('a').first.attr('href') + + expect(link).not_to match %r{https?://} + expect(link).to eq urls.send :"#{subject_name}_path", subject + end + + describe 'referencing a resource in a link href' do + let(:reference) { %(Some text) } + + it_behaves_like 'it contains a data- attribute' + + it 'links to the resource' do + doc = reference_filter("Hey #{reference}") + expect(doc.css('a').first.attr('href')).to eq urls.send :"#{subject_name}_url", subject + end + + it 'links with adjacent text' do + doc = reference_filter("Mention me (#{reference}.)") + expect(doc.to_html).to match(%r{\(Some text\.\)}) + end + end +end + RSpec.shared_examples 'HTML text with references' do let(:markdown_prepend) { "<img src=\"\" onerror=alert('bug')>" } @@ -22,3 +110,69 @@ expect(doc.to_html).to include text end end + +RSpec.shared_examples 'a reference which does not unescape its content in data-original' do + it 'does not remove a layer of escaping in data-original' do + # We assert the expected resource.title so calling `it_behaves_like` blocks can better describe + # the known behaviour around the resource. Some classes process their title in ways that are + # particularly relevant to XSS; the big example is Labels, which unescape HTML entities in + # their input. It's helpful to call out when this happens (and when this doesn't) so the rest + # of the behaviours are clear. + expect(resource.title).to eq(expected_resource_title) + + result = reference_filter("See #{reference}", context) + + expect(result.css('a').first.attr('href')).to eq(expected_href) + + # This is the important part. + expect(result.css('a').first.attr('data-original')).to eq_html(reference) + + expect(result.content).to eq "See #{expected_replacement_text}" + end +end + +# Expects an input reference, and a filter instance to call #references_in on: +# let(:reference) { range.to_reference } +# let(:filter_instance) { described_class.new(nil, { project: nil }) } +# +# Optionally, you can override the expected replacement by setting `:expected_replacement`. +# In particular, if no replacement is expected, say: +# let(:expected_replacement) { nil } +# +# See the doc comment on ReferenceFilter#references_in for a detailed discussion on +# this function's input, block, and output. +RSpec.shared_examples_for 'ReferenceFilter#references_in' do + let(:replacement) do + doc = Nokogiri::HTML.fragment("link to label") + doc.css('a').first['href'] = '/destination' + doc.to_html + end + + let(:expected_replacement) { replacement } + + it "does not return input text unescaped" do + text = "don't unescape #{reference} <anything>" + + html = filter_instance.references_in(text) do |match| + # Special case: ProjectReferenceFilter will match "b>" and "/b>" from the above text. + # This is correct. Let it pass through, translating text to HTML, like it normally would. + # + # Extremely special case: it'll also match "anything>", though hopefully this will + # become less necessary in the future. This is incorrect, but currently the case. + # See Project.reference_postfix_escaped. + next CGI.escapeHTML(match) if match == "b>" || match == "/b>" || match == "anything>" + + expect(match).to eq(reference) + replacement + end + + case expected_replacement + when NilClass + expect(html).to be_nil + when String + expect(html).to eq_html("don't <b>unescape</b> #{expected_replacement} &lt;anything&gt;") + else + raise ArgumentError, "expected_replacement should be nil or a String, not #{expected_replacement.class}" + end + end +end diff --git a/spec/support_specs/matchers/match_html_spec.rb b/spec/support_specs/matchers/match_html_spec.rb new file mode 100644 index 00000000000000..a8d42ad3023b36 --- /dev/null +++ b/spec/support_specs/matchers/match_html_spec.rb @@ -0,0 +1,25 @@ +# frozen_string_literal: true + +require 'fast_spec_helper' +require_relative '../../support/matchers/eq_html' + +# rubocop: disable RSpec/ExpectActual -- these *are* the actual values we're testing. +RSpec.describe "eq_html", feature_category: :shared do + it "normalizes entities" do + expect(%( x < y + "1"&'2' )).to eq_html(%( x < y + "1"&'2' )) + end + + it "doesn't mix up entities" do + expect(%( no )).not_to eq_html(%( <em>no</em> )) + end + + it "normalizes attributes" do + expect(%( Click )).to \ + eq_html(%( Click )) + end + + it "doesn't confuse similar attributes" do + expect(%( Click )).not_to eq_html(%( Click )) + end +end +# rubocop: enable RSpec/ExpectActual -- GitLab From ba0b2ef5c24d2143906e2b2b44d0e6230abf2b46 Mon Sep 17 00:00:00 2001 From: Asherah Connor Date: Tue, 21 Oct 2025 14:09:09 +1100 Subject: [PATCH 2/3] Specs for unknown argument cases These are defensive cases, intended to give other contributors a better experience than `NoMethodError` when unexpected types are used. --- .../external_issue_reference_filter.rb | 2 +- .../external_issue_reference_filter_spec.rb | 35 ++++++++++--------- .../references/reference_filter_spec.rb | 15 +++++++- 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/lib/banzai/filter/references/external_issue_reference_filter.rb b/lib/banzai/filter/references/external_issue_reference_filter.rb index 4c36ee373807d7..d376356a05dabf 100644 --- a/lib/banzai/filter/references/external_issue_reference_filter.rb +++ b/lib/banzai/filter/references/external_issue_reference_filter.rb @@ -43,7 +43,7 @@ def references_in(text, pattern = object_reference_pattern) when Gitlab::UntrustedRegexp pattern.replace_gsub(text, limit: Banzai::Filter::FILTER_ITEM_LIMIT) else - return + raise ArgumentError, "#{self.class.name} given #{pattern.class.name} pattern; should be Regexp or Gitlab::UntrustedRegexp" end replace_references_in_text_with_html(enumerator) do |match_data| diff --git a/spec/lib/banzai/filter/references/external_issue_reference_filter_spec.rb b/spec/lib/banzai/filter/references/external_issue_reference_filter_spec.rb index 36b55060ce5b2e..b8d6fab27a5b6f 100644 --- a/spec/lib/banzai/filter/references/external_issue_reference_filter_spec.rb +++ b/spec/lib/banzai/filter/references/external_issue_reference_filter_spec.rb @@ -374,29 +374,32 @@ def filter(markdown, context = {}) end end - context "pattern is Regexp" do - it_behaves_like 'ReferenceFilter#references_in' do - let_it_be(:integration) { create(:redmine_integration, project: project) } + context 'ReferenceFilter#references_in' do + let(:issue) { ExternalIssue.new("T-123", project) } + let(:reference) { issue.to_reference } - let(:issue) { ExternalIssue.new("T-123", project) } - let(:reference) { issue.to_reference } + let(:filter_instance) { described_class.new(nil, { project: }) } - let(:filter_instance) { described_class.new(nil, { project: }) } + context "pattern is Regexp" do + it_behaves_like 'ReferenceFilter#references_in' do + let_it_be(:integration) { create(:redmine_integration, project: project) } + end end - end - - context "pattern is Gitlab::UntrustedRegexp" do - it_behaves_like 'ReferenceFilter#references_in' do - let_it_be(:integration) { create(:redmine_integration, project: project) } - let(:issue) { ExternalIssue.new("T-123", project) } - let(:reference) { issue.to_reference } + context "pattern is Gitlab::UntrustedRegexp" do + it_behaves_like 'ReferenceFilter#references_in' do + let_it_be(:integration) { create(:redmine_integration, project: project) } - before do - allow_any_instance_of(described_class).to receive(:object_reference_pattern).and_return(Gitlab::UntrustedRegexp.new("\\b[A-Z][A-Z0-9_]*-(?\\d{1,20})")) + before do + allow_any_instance_of(described_class).to receive(:object_reference_pattern).and_return(Gitlab::UntrustedRegexp.new("\\b[A-Z][A-Z0-9_]*-(?\\d{1,20})")) + end end + end - let(:filter_instance) { described_class.new(nil, { project: }) } + context "pattern is unknown type" do + it 'raises ArgumentError' do + expect { filter_instance.references_in("some text", Object.new) }.to raise_error(ArgumentError) + end end end end diff --git a/spec/lib/banzai/filter/references/reference_filter_spec.rb b/spec/lib/banzai/filter/references/reference_filter_spec.rb index 60eca79c64a1c5..7065d70160b8b6 100644 --- a/spec/lib/banzai/filter/references/reference_filter_spec.rb +++ b/spec/lib/banzai/filter/references/reference_filter_spec.rb @@ -207,7 +207,20 @@ def call_method(method_name, replacement) end end - describe '.nodes?' do + describe '#replace_references_in_text_with_html' do + let(:document) { Nokogiri::HTML.fragment('foo') } + let(:filter) { described_class.new(document, project:) } + + context 'enumerator yields non-MatchData' do + let(:weird_enumerator) { ["weh"].each } + + it 'raises ArgumentError' do + expect { filter.replace_references_in_text_with_html(weird_enumerator) }.to raise_error(ArgumentError) + end + end + end + + describe '#nodes?' do let_it_be(:document) { Nokogiri::HTML.fragment('foo') } let(:filter) { described_class.new(document, project: project) } -- GitLab From b9a43a191662bf60a94b60a2799c282a025662f3 Mon Sep 17 00:00:00 2001 From: Asherah Connor Date: Wed, 22 Oct 2025 15:18:30 +1100 Subject: [PATCH 3/3] Change some comments into assertions --- .../lib/banzai/pipeline/full_pipeline_spec.rb | 57 +++++++++++-------- spec/support/matchers/eq_html.rb | 38 +++++++++---- 2 files changed, 59 insertions(+), 36 deletions(-) diff --git a/spec/lib/banzai/pipeline/full_pipeline_spec.rb b/spec/lib/banzai/pipeline/full_pipeline_spec.rb index 7b7360bcc5ffc9..22a8cd0ce7e63e 100644 --- a/spec/lib/banzai/pipeline/full_pipeline_spec.rb +++ b/spec/lib/banzai/pipeline/full_pipeline_spec.rb @@ -43,36 +43,45 @@ markdown.delete!("\n") - # The PlainMarkdownPipeline yields the following DOM (newlines and shape added for clarity): - # - # + # Part of understanding this spec means knowing how the above is actually treated + # by the FullPipeline. The PlainMarkdownPipeline runs first and produces the DOM + # which is actually operated on by the GfmPipeline. # + # Here we recreate the PlainMarkdownPipeline's effects by themselves and assert the + # result, to clarify what the malicious input is interpreted as: + + html_after_plain_markdown_pipeline = Banzai::Pipeline::PlainMarkdownPipeline.to_html(markdown, project:) + expect(html_after_plain_markdown_pipeline).to eq_html(<<-HTML, trim_text_nodes: true) + + HTML result = described_class.to_html(markdown, project: project) expect(result).to include "" - # The result at the end of GfmPipeline is as follows (again, pretty-printed): + # As above, we assert the full result to clarify the full result. We used to interpret + # this input as a valid reference link, which eventually lead to the original XSS whose + # fix this spec initially accompanied. # - #
- # - # #1 - # - # - # - # - #
+ # We no longer do; we do not compare the full inner HTML of the link with the href + # attribute, but only its textual contents. + expect(result).to eq_html(<<-HTML, trim_text_nodes: true) + + HTML end it 'escapes the data-original attribute on a reference' do diff --git a/spec/support/matchers/eq_html.rb b/spec/support/matchers/eq_html.rb index 5bdaa05d43b25a..ab1c9dcf846cc2 100644 --- a/spec/support/matchers/eq_html.rb +++ b/spec/support/matchers/eq_html.rb @@ -16,22 +16,32 @@ # # Note that it uses a real HTML parser internally so as not to introduce security issues # through accidental de-escaping or any of the other myriad ways HTML can be messed with. -RSpec::Matchers.define :eq_html do |expected| +# +# --- +# +# The option "trim_text_nodes: true" can be given. This trims whitespace at the start and +# end of the text content of text nodes on each side. This should be used with care, but can +# aid in spec clarity by allowing expected DOMs to be written "prettified". +RSpec::Matchers.define :eq_html do |expected, **normalize_opts| + include EqHtmlMatcher + match do |actual| raise ArgumentError, "expected should be a String, not #{expected.class}" unless expected.is_a?(String) raise ArgumentError, "actual should be a String, not #{actual.class}" unless actual.is_a?(String) - EqHtmlMatcher.normalize_html(actual) == EqHtmlMatcher.normalize_html(expected) + normalize_html(actual, **normalize_opts) == normalize_html(expected, **normalize_opts) end failure_message do |actual| "Expected that\n\n #{actual}\n\nwould normalize to the same HTML as\n\n #{expected}\n\nbut it didn't.\n\n " \ - "#{EqHtmlMatcher.normalize_html(actual).inspect}\n!=\n #{EqHtmlMatcher.normalize_html(expected).inspect}" + "#{normalize_html(actual, **normalize_opts).inspect}\n!=\n " \ + "#{normalize_html(expected, **normalize_opts).inspect}" end failure_message_when_negated do |actual| "Expected that\n\n #{actual}\n\nwould NOT normalize to the same HTML as\n\n #{expected}\n\nbut it did!\n\n " \ - "#{EqHtmlMatcher.normalize_html(actual).inspect}\n==\n #{EqHtmlMatcher.normalize_html(expected).inspect}" + "#{normalize_html(actual, **normalize_opts).inspect}\n==\n " \ + "#{normalize_html(expected, **normalize_opts).inspect}" end end @@ -43,33 +53,35 @@ # includes the HTML "