# use-case-shopping/logstash.conf

input {

  # Items file. No codec is set here; the filter section below parses each
  # line out of the "message" field.
  file {
    # TODO: fill in the absolute path to the file (the file input does not accept relative paths)
    # path => "/PATH/TO/meta_sports_20k_sample.json"

    # mark every event from this file, so the filters and the output can tell items from reviews
    add_field => { "document_type" => "item" }

    # always ingest the whole file: start reading at the top...
    start_position => "beginning"
    # ...and keep no read position across Logstash restarts
    sincedb_path => "/dev/null"
  }

  # Reviews file. It is line-delimited JSON, so the json codec decodes every
  # line straight into event fields.
  file {
    # TODO: fill in the absolute path to the file (the file input does not accept relative paths)
    # path => "/PATH/TO/reviews_sports_24k_sample.json"

    codec => "json"

    # mark every event from this file, so the filters and the output can tell reviews from items
    add_field => { "document_type" => "review" }

    # always ingest the whole file: start reading at the top,
    # and keep no read position across Logstash restarts
    start_position => "beginning"
    sincedb_path => "/dev/null"
  }
}

filter {

  #############
  ### ITEMS ###
  #############
  if [document_type] == "item" {
    # parse the python literal format
    ruby {
        code => '
        # SECURITY: eval executes the whole line as Ruby code. It is used here
        # because a Python dict literal with quoted keys also reads as a Ruby
        # hash literal (with symbol keys). Only point this pipeline at files
        # that you trust.
        parsed_data = eval(event.get("message"))

        # Copy each field to the root level, converting symbol keys to strings
        parsed_data.each do |key, value|
            event.set(key.to_s, value)
        end
        '
    }

    # give the document a random timestamp (epoch seconds) within the last 365 days
    ruby {
        code => "
        now = Time.now.to_i
        year_in_seconds = 365 * 24 * 60 * 60
        event.set('timestamp', rand((now - year_in_seconds)..now))
        "
    }

    # In the data set, categories are a list of hierarchy chains, for example
    #   [ ["Sports & Outdoors", "Accessories", "Sport Watches"] ]
    # Two fields are derived from them:
    #  - "categories", for exact-match filtering and grouping: every prefix of each
    #    chain, joined with "|". Several subcategories share the same name, so the
    #    full path is kept to preserve the hierarchy:
    #      [ "Sports & Outdoors", "Sports & Outdoors|Accessories", "Sports & Outdoors|Accessories|Sport Watches" ]
    #  - "categories_text", for stemmed free-text search: each chain reversed
    #    (for better relevance) and joined with spaces:
    #      "Sport Watches Accessories Sports & Outdoors"
    ruby {
        code => "
        chains = event.get('categories')
        if chains.is_a?(Array)
            paths = []
            reversed_chains = []

            chains.each do |chain|
                # grow the path one level at a time, recording every prefix
                prefix = ''
                chain.each do |name|
                    prefix = prefix.empty? ? name : prefix + '|' + name
                    paths << prefix
                end

                # reversed chain for the free-text field
                reversed_chains << chain.reverse.join(' ')
            end

            # overwrite the original field and add the free-text one
            event.set('categories', paths)
            event.set('categories_text', reversed_chains.join(' '))
        end
        "
    }

    # Merge the per-key lists under "related" into one de-duplicated array
    ruby {
        code => "
        related_groups = event.get('related')
        if related_groups
            merged = []
            # the key of each entry is ignored; only its list of products is kept
            related_groups.each { |_, products| merged.concat(products) }
            event.set('related', merged.uniq)
        end
        "
    }

    # Replace the single imUrl value with a one-element images array
    ruby {
        code => "
        image_url = event.get('imUrl')
        if image_url
            event.remove('imUrl')
            event.set('images', [image_url])
        end
        "
    }

    # Add the remaining item fields in one go:
    #  - rating_stars and rating_count start out at 0
    #  - the Vespa ID is the product ID (asin)
    mutate {
        add_field => {
            "rating_stars" => 0
            "rating_count" => 0
            "vespa_id" => "%{asin}"
        }
    }

  }


  ###############
  ### REVIEWS ###
  ###############
  if [document_type] == "review" {
    # rename fields to match the Vespa schema
    mutate {
        rename => { "reviewerID" => "reviewer_id" }
        rename => { "reviewerName" => "reviewer_name" }
        rename => { "unixReviewTime" => "timestamp" }
        rename => { "reviewText" => "text" }
        rename => { "summary" => "title" }

        rename => { "overall" => "stars" }
        convert => { "stars" => "integer" }
    }

    # Check for illegal characters and drop the document if found.
    # This has to run after the rename above, because it looks the fields up
    # by their renamed names (reviewer_name, title, text).
    ruby {
      code => "
        # 0x1A.chr is the actual SUB control character (U+001A). A single-quoted
        # Ruby literal containing a backslash-u escape is not expanded, so it
        # would only ever match that literal text.
        illegal_char = 0x1A.chr
        text_fields = ['reviewer_name', 'title', 'text']

        # to_s makes a missing (nil) field count as not containing the character
        if text_fields.any? { |field| event.get(field).to_s.include?(illegal_char) }
          event.cancel
        end
      "
    }

    # Split the helpful pair into upvotes and downvotes:
    # the first element is the upvote count, the second one the total vote count
    ruby {
      code => "
        votes = event.get('helpful')
        if votes
          up = votes[0].to_i
          event.set('upvotes', up)
          # whatever is not an upvote counts as a downvote
          event.set('downvotes', votes[1].to_i - up)
          event.remove('helpful')
        end
      "
    }

    # the Vespa ID is the product ID + the reviewer ID, joined with a dash
    # (reviewer_id is the reviewerID field, renamed by the mutate filter above)
    mutate {
        add_field => { "vespa_id" => "%{asin}-%{reviewer_id}" }
    }
  }

  ##############
  ### COMMON ###
  ##############
  mutate {
    # Strip the fields that should not end up in the documents. Listed first are
    # the event bookkeeping fields and the raw input line ("message"), then the
    # two data set fields that are not used (salesRank, reviewTime).
    remove_field => [
      "@timestamp", "@version", "event", "host", "log", "file", "original", "message",
      "salesRank", "reviewTime"
    ]
  }

}

output {
  #stdout { codec => rubydebug }

  # write to Vespa
  vespa_feed {
    vespa_url => "http://localhost:8080"
    operation => "put"

    # Both the Vespa document type and the namespace are taken from the
    # "document_type" metadata field that we added in the "input" section...
    document_type => "%{document_type}"
    namespace => "%{document_type}"
    # ...and that field is then removed from the document itself
    remove_document_type => true

    # The document ID is read from "vespa_id". That field is removed from the
    # document as well (we have this info in other fields of the documents anyway).
    id_field => "vespa_id"
    remove_id => true
  }
}