From 0f2aa00ef31346795952af06d4e19698a7aabc4f Mon Sep 17 00:00:00 2001 From: Valery Sizov Date: Wed, 7 Jun 2017 16:26:16 +0300 Subject: [PATCH 1/3] Elasticsearch only finds first occurrence in a file[ci skip] --- app/assets/stylesheets/pages/search.scss | 4 ++ app/helpers/blob_helper.rb | 9 ++- app/models/blob.rb | 4 ++ app/views/search/results/_blob.html.haml | 25 +------ app/views/search/results/_blob_es.html.haml | 23 +++++++ .../search/results/_blob_regular.html.haml | 23 +++++++ app/views/shared/_file_highlight.html.haml | 6 +- lib/gitlab/elastic/search_results.rb | 66 +++++++++---------- lib/gitlab/elastic/snippet_processor.rb | 34 ++++++++++ lib/gitlab/search_results.rb | 5 +- 10 files changed, 137 insertions(+), 62 deletions(-) create mode 100644 app/views/search/results/_blob_es.html.haml create mode 100644 app/views/search/results/_blob_regular.html.haml create mode 100644 lib/gitlab/elastic/snippet_processor.rb diff --git a/app/assets/stylesheets/pages/search.scss b/app/assets/stylesheets/pages/search.scss index b9818ffcf4241b..9b4d4f96339d5c 100644 --- a/app/assets/stylesheets/pages/search.scss +++ b/app/assets/stylesheets/pages/search.scss @@ -1,4 +1,8 @@ .search-results { + strong.highlighted-terms { + background-color: yellow; + } + .search-result-row { border-bottom: 1px solid $border-color; padding-bottom: $gl-padding; diff --git a/app/helpers/blob_helper.rb b/app/helpers/blob_helper.rb index e964d7a5e1690b..5eef82acac3b61 100644 --- a/app/helpers/blob_helper.rb +++ b/app/helpers/blob_helper.rb @@ -1,6 +1,13 @@ module BlobHelper - def highlight(blob_name, blob_content, repository: nil, plain: false) + def highlight(blob_name, blob_content, repository: nil, plain: false, highlighted_terms: []) highlighted = Gitlab::Highlight.highlight(blob_name, blob_content, plain: plain, repository: repository) + + if highlighted_terms.any? + escaped_terms = highlighted_terms.map {|term| Regexp.escape(term) } + regexp = Regexp.new("(#{escaped_terms.join('|')})") + highlighted.gsub!(regexp, '\1').html_safe + end + raw %(
#{highlighted}
) end diff --git a/app/models/blob.rb b/app/models/blob.rb index 954d4e4d779c95..c07231a8e0d1f8 100644 --- a/app/models/blob.rb +++ b/app/models/blob.rb @@ -102,6 +102,10 @@ def no_highlighting? raw_size && raw_size > MAXIMUM_TEXT_HIGHLIGHT_SIZE end + def highlighted_terms + [] + end + def empty? raw_size == 0 end diff --git a/app/views/search/results/_blob.html.haml b/app/views/search/results/_blob.html.haml index b90ee509ece12a..c1827dbfbe403c 100644 --- a/app/views/search/results/_blob.html.haml +++ b/app/views/search/results/_blob.html.haml @@ -1,23 +1,2 @@ -- project = @project || find_project_for_blob(blob) - -- if blob.is_a?(Array) - - file_name, blob = blob -- else - - blob = parse_search_result(blob) - - file_name = blob.filename - -- blob_link = project_blob_path(project, tree_join(blob.ref, file_name)) -.blob-result - .file-holder - .js-file-title.file-title - = link_to blob_link do - = icon('fa-file') - %strong - - if @project - = file_name - - else - #{project.name_with_namespace}: - %i= file_name - - if blob.data - .file-content.code.term - = render 'shared/file_highlight', blob: blob, first_line_number: blob.startline, blob_link: blob_link +- partial_name = current_application_settings.elasticsearch_search? ? 'blob_es' : 'blob_regular' += render "search/results/#{partial_name}", blob: blob diff --git a/app/views/search/results/_blob_es.html.haml b/app/views/search/results/_blob_es.html.haml new file mode 100644 index 00000000000000..e6b8fb5b0ae065 --- /dev/null +++ b/app/views/search/results/_blob_es.html.haml @@ -0,0 +1,23 @@ +- project = @project || find_project_for_blob(blob) +- blobs = parse_search_result(blob) +- blob_link = namespace_project_blob_path(project.namespace, project, tree_join(blobs[:ref], blobs[:filename])) + +.blob-result + .file-holder + .js-file-title.file-title + = link_to blob_link do + = icon('fa-file') + %strong + - if @project + = blobs[:filename] + - else + #{project.name_with_namespace}: + %i= blobs[:filename] + + - blobs[:blobs].each do |blob| + - if blob.data + .file-content.code.term + = render 'shared/file_highlight', blob: blob, first_line_number: blob.startline, blob_link: blob_link + - if blob != blobs[:blobs].last + .file-content.code.term + \--- diff --git a/app/views/search/results/_blob_regular.html.haml b/app/views/search/results/_blob_regular.html.haml new file mode 100644 index 00000000000000..b90ee509ece12a --- /dev/null +++ b/app/views/search/results/_blob_regular.html.haml @@ -0,0 +1,23 @@ +- project = @project || find_project_for_blob(blob) + +- if blob.is_a?(Array) + - file_name, blob = blob +- else + - blob = parse_search_result(blob) + - file_name = blob.filename + +- blob_link = project_blob_path(project, tree_join(blob.ref, file_name)) +.blob-result + .file-holder + .js-file-title.file-title + = link_to blob_link do + = icon('fa-file') + %strong + - if @project + = file_name + - else + #{project.name_with_namespace}: + %i= file_name + - if blob.data + .file-content.code.term + = render 'shared/file_highlight', blob: blob, first_line_number: blob.startline, blob_link: blob_link diff --git a/app/views/shared/_file_highlight.html.haml b/app/views/shared/_file_highlight.html.haml index 8d64cb5d698b81..bd08005e888f87 100644 --- a/app/views/shared/_file_highlight.html.haml +++ b/app/views/shared/_file_highlight.html.haml @@ -13,4 +13,8 @@ = link_icon = i .blob-content{ data: { blob_id: blob.id } } - = highlight(blob.path, blob.data, repository: repository, plain: blob.no_highlighting?) + = highlight(blob.path, + blob.data, + repository: repository, + plain: blob.no_highlighting?, + highlighted_terms: blob.highlighted_terms) diff --git a/lib/gitlab/elastic/search_results.rb b/lib/gitlab/elastic/search_results.rb index ad061eccb3a832..ef4843931e9f54 100644 --- a/lib/gitlab/elastic/search_results.rb +++ b/lib/gitlab/elastic/search_results.rb @@ -68,46 +68,42 @@ def single_commit_result? end def self.parse_search_result(result) - ref = result["_source"]["blob"]["commit_sha"] - filename = result["_source"]["blob"]["path"] + ref = result['_source']['blob']['commit_sha'] + filename = result['_source']['blob']['path'] extname = File.extname(filename) basename = filename.sub(/#{extname}$/, '') - content = result["_source"]["blob"]["content"] - total_lines = content.lines.size - - highlighted_content = result["highlight"]["blob.content"] - term = highlighted_content && highlighted_content[0].match(/gitlabelasticsearch→(.*?)←gitlabelasticsearch/)[1] - - found_line_number = 0 - - content.each_line.each_with_index do |line, index| - if term && line.include?(term) - found_line_number = index - break + content = result['_source']['blob']['content'] + + highlighted_content = result['highlight']['blob.content'] + blobs_found_by_filename = result['highlight']['blob.file_name'] + found_file = { filename: filename, ref: ref, blobs: []} + + if highlighted_content + found_file[:blobs] = highlighted_content.map do |snippet| + snippet_processor = ::Gitlab::Elastic::SnippetProcessor.new(content, snippet) + + ::Gitlab::SearchResults::FoundBlob.new( + filename: filename, + basename: basename, + ref: ref, + startline: snippet_processor.startline, + data: snippet_processor.clean_snippet, + highlighted_terms: snippet_processor.highlighted_terms + ) end end + + if blobs_found_by_filename + found_file[:blobs] << ::Gitlab::SearchResults::FoundBlob.new( + filename: filename, + basename: basename, + ref: ref, + startline: 1, + data: content.lines[0..3].join + ) + end - from = if found_line_number >= 2 - found_line_number - 2 - else - found_line_number - end - - to = if (total_lines - found_line_number) > 3 - found_line_number + 2 - else - found_line_number - end - - data = content.lines[from..to] - - ::Gitlab::SearchResults::FoundBlob.new( - filename: filename, - basename: basename, - ref: ref, - startline: from + 1, - data: data.join - ) + found_file end private diff --git a/lib/gitlab/elastic/snippet_processor.rb b/lib/gitlab/elastic/snippet_processor.rb new file mode 100644 index 00000000000000..9ddfb16b1c2205 --- /dev/null +++ b/lib/gitlab/elastic/snippet_processor.rb @@ -0,0 +1,34 @@ +module Gitlab + module Elastic + class SnippetProcessor + def initialize(whole_document, snippet) + @whole_document = whole_document + @snippet = snippet + end + + def startline + index = @whole_document.rindex(clean_snippet) + @whole_document[0, index].count("\n") + 1 + end + + # Returns the origin snippet without any highlight tags + def clean_snippet + snippet_processed.gsub(/gitlabelasticsearch→|←gitlabelasticsearch/, '') + end + + # Returns highlighted terms + def highlighted_terms + snippet_processed.scan(/gitlabelasticsearch→(.*?)←gitlabelasticsearch/).flatten.uniq + end + + private + + # Removes a first line if it does not have the highlight tags + # It's needed because ES can return not the full first code line which is + # unthinkable to keep as is. + def snippet_processed + @snippet_processed ||= @snippet.sub(/^((?!gitlabelasticsearch→).)*[\n\r]/, '') + end + end + end +end diff --git a/lib/gitlab/search_results.rb b/lib/gitlab/search_results.rb index efe8095beea31e..f4f4a2ede30533 100644 --- a/lib/gitlab/search_results.rb +++ b/lib/gitlab/search_results.rb @@ -1,7 +1,7 @@ module Gitlab class SearchResults class FoundBlob - attr_reader :id, :filename, :basename, :ref, :startline, :data + attr_reader :id, :filename, :basename, :ref, :startline, :data, :highlighted_terms def initialize(opts = {}) @id = opts.fetch(:id, nil) @@ -10,12 +10,13 @@ def initialize(opts = {}) @ref = opts.fetch(:ref, nil) @startline = opts.fetch(:startline, nil) @data = opts.fetch(:data, nil) + @highlighted_terms = opts.fetch(:highlighted_terms, []) end def path filename end - + def no_highlighting? false end -- GitLab From 44a185a86a4c0065428f01cf229d4b5d71310a87 Mon Sep 17 00:00:00 2001 From: Valery Sizov Date: Tue, 8 Aug 2017 14:57:38 +0300 Subject: [PATCH 2/3] Fix snippet clining[ci skip] --- lib/gitlab/elastic/snippet_processor.rb | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/gitlab/elastic/snippet_processor.rb b/lib/gitlab/elastic/snippet_processor.rb index 9ddfb16b1c2205..bf4eed88915af2 100644 --- a/lib/gitlab/elastic/snippet_processor.rb +++ b/lib/gitlab/elastic/snippet_processor.rb @@ -1,14 +1,15 @@ module Gitlab module Elastic class SnippetProcessor - def initialize(whole_document, snippet) - @whole_document = whole_document + def initialize(document, snippet) + @document = document @snippet = snippet end def startline - index = @whole_document.rindex(clean_snippet) - @whole_document[0, index].count("\n") + 1 + index = @document.rindex(clean_snippet) + # byebug if index.nil? + @document[0, index].count("\n") + 1 end # Returns the origin snippet without any highlight tags @@ -24,10 +25,10 @@ def highlighted_terms private # Removes a first line if it does not have the highlight tags - # It's needed because ES can return not the full first code line which is - # unthinkable to keep as is. + # It's needed because ES can return non-complite code line which is + # unthinkable to keep as is. We should treat code lines as atomic entitties. def snippet_processed - @snippet_processed ||= @snippet.sub(/^((?!gitlabelasticsearch→).)*[\n\r]/, '') + @snippet_processed ||= @snippet.sub(/\A((?!gitlabelasticsearch→).)*[\n\r]/, '') end end end -- GitLab From 8369fab102fd9e6dd91d95a7810169bfa3908409 Mon Sep 17 00:00:00 2001 From: Valery Sizov Date: Tue, 8 Aug 2017 22:14:12 +0300 Subject: [PATCH 3/3] Ordering and filtering code snippets[ci skip] --- .../{snippet_processor.rb => code_snippet.rb} | 13 ++++++-- lib/gitlab/elastic/code_snippet_collection.rb | 30 +++++++++++++++++++ lib/gitlab/elastic/search_results.rb | 25 +++++++++++----- 3 files changed, 57 insertions(+), 11 deletions(-) rename lib/gitlab/elastic/{snippet_processor.rb => code_snippet.rb} (83%) create mode 100644 lib/gitlab/elastic/code_snippet_collection.rb diff --git a/lib/gitlab/elastic/snippet_processor.rb b/lib/gitlab/elastic/code_snippet.rb similarity index 83% rename from lib/gitlab/elastic/snippet_processor.rb rename to lib/gitlab/elastic/code_snippet.rb index bf4eed88915af2..6c187f2ce05d78 100644 --- a/lib/gitlab/elastic/snippet_processor.rb +++ b/lib/gitlab/elastic/code_snippet.rb @@ -1,6 +1,6 @@ module Gitlab module Elastic - class SnippetProcessor + class CodeSnippet def initialize(document, snippet) @document = document @snippet = snippet @@ -8,10 +8,17 @@ def initialize(document, snippet) def startline index = @document.rindex(clean_snippet) - # byebug if index.nil? @document[0, index].count("\n") + 1 end + def line_range + startline..last_line_number + end + + def last_line_number + startline + snippet_processed.count("\n") + 1 + end + # Returns the origin snippet without any highlight tags def clean_snippet snippet_processed.gsub(/gitlabelasticsearch→|←gitlabelasticsearch/, '') @@ -26,7 +33,7 @@ def highlighted_terms # Removes a first line if it does not have the highlight tags # It's needed because ES can return non-complite code line which is - # unthinkable to keep as is. We should treat code lines as atomic entitties. + # unthinkable to keep as is. We should treat code lines as atomic entities. def snippet_processed @snippet_processed ||= @snippet.sub(/\A((?!gitlabelasticsearch→).)*[\n\r]/, '') end diff --git a/lib/gitlab/elastic/code_snippet_collection.rb b/lib/gitlab/elastic/code_snippet_collection.rb new file mode 100644 index 00000000000000..75ac62cd4074b1 --- /dev/null +++ b/lib/gitlab/elastic/code_snippet_collection.rb @@ -0,0 +1,30 @@ +module Gitlab + module Elastic + class CodeSnippetCollection + SNIPPETS_MAX = 3 + + def initialize(code_snippets) + @code_snippets = code_snippets + end + + def order + @code_snippets.sort!{ |a, b| a.startline <=> b.startline } + self + end + + def remove_overlapped_snippets + filtered = [] + + @code_snippets.each do |snippet| + break if filtered.size == SNIPPETS_MAX + + unless filtered.any?{ |s| snippet.line_range.overlaps? s.line_range } + filtered << snippet + end + end + + filtered + end + end + end +end diff --git a/lib/gitlab/elastic/search_results.rb b/lib/gitlab/elastic/search_results.rb index ef4843931e9f54..7c7d5bd911046c 100644 --- a/lib/gitlab/elastic/search_results.rb +++ b/lib/gitlab/elastic/search_results.rb @@ -1,6 +1,12 @@ module Gitlab module Elastic class SearchResults + # There is probability that we have one huge line where there is a huge number of + # highlighted snippets. To not have any performance problems we need to limit the number + # of snippets we will consider as all the calculation related to snippets is + # pretty expensive. + MAX_SNIPPETS_TO_CONSIDER = 10 + attr_reader :current_user, :query, :public_and_internal_projects # Limit search results by passed project ids @@ -77,22 +83,25 @@ def self.parse_search_result(result) highlighted_content = result['highlight']['blob.content'] blobs_found_by_filename = result['highlight']['blob.file_name'] found_file = { filename: filename, ref: ref, blobs: []} - + if highlighted_content - found_file[:blobs] = highlighted_content.map do |snippet| - snippet_processor = ::Gitlab::Elastic::SnippetProcessor.new(content, snippet) - + # byebug + snippets = highlighted_content.take(MAX_SNIPPETS_TO_CONSIDER).map{ |s| ::Gitlab::Elastic::CodeSnippet.new(content, s)} + + snippets = ::Gitlab::Elastic::CodeSnippetCollection.new(snippets).order.remove_overlapped_snippets + + found_file[:blobs] = snippets.map do |snippet| ::Gitlab::SearchResults::FoundBlob.new( filename: filename, basename: basename, ref: ref, - startline: snippet_processor.startline, - data: snippet_processor.clean_snippet, - highlighted_terms: snippet_processor.highlighted_terms + startline: snippet.startline, + data: snippet.clean_snippet, + highlighted_terms: snippet.highlighted_terms ) end end - + if blobs_found_by_filename found_file[:blobs] << ::Gitlab::SearchResults::FoundBlob.new( filename: filename, -- GitLab