From 6053c8ebadcca1118e00c2f3b2d336d1d31b9cff Mon Sep 17 00:00:00 2001 From: Kevin Novak Date: Fri, 22 Nov 2019 17:51:08 -0500 Subject: [PATCH] Download advanced data as CSV (#14) * Download data as CSV * Refactoring * Add countdown data * Create regex utils, string utils * Fix games with demo --- app.js | 16 +-- public/index.html | 2 + public/scripts/index.js | 125 +++++++++++++++----- services/steam-scraper.js | 242 +++++++++++++++++++++++--------------- utils/regex-utils.js | 30 +++++ utils/string-utils.js | 7 ++ 6 files changed, 288 insertions(+), 134 deletions(-) create mode 100644 utils/regex-utils.js create mode 100644 utils/string-utils.js diff --git a/app.js b/app.js index f03314f..e8bafd9 100644 --- a/app.js +++ b/app.js @@ -12,14 +12,8 @@ function main() { _app.post('/api/app-scrape', async (req, res) => { let appUrl = req.body.url; - let gamePageData = await _steamScraper.getAppPageData(appUrl); - res.json(gamePageData); - }); - - _app.post('/api/headset-scrape', async (req, res) => { - let searchUrl = req.body.url; - let searchPageData = await _steamScraper.getHeadsetsFromAppPage(searchUrl); - res.json(searchPageData); + let appPageData = await _steamScraper.getAppPageData(appUrl); + res.json(appPageData); }); _app.post('/api/search-scrape', async (req, res) => { @@ -28,6 +22,12 @@ function main() { res.json(searchPageData); }); + _app.post('/api/search-app-scrape', async (req, res) => { + let appUrl = req.body.url; + let appPageData = await _steamScraper.getSearchAppPageData(appUrl); + res.json(appPageData); + }); + _app.listen(PORT, () => { console.log(`App listening on port ${PORT}!`); }); diff --git a/public/index.html b/public/index.html index 698849e..5bcf5c8 100644 --- a/public/index.html +++ b/public/index.html @@ -264,6 +264,8 @@
integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous" > + + diff --git a/public/scripts/index.js b/public/scripts/index.js index b3cf54c..05246cd 100644 --- a/public/scripts/index.js +++ b/public/scripts/index.js @@ -6,7 +6,9 @@ const PERCENT_NUMBER_REGEX = /(\d+)%/; const NEW_LINE = ' '; const MAX_PAGES = 100; -const headsetAliases = { +const BUNDLE_PREFIX = "**Bundle** - "; + +const HEADSET_ALIASES = { 'Valve Index': { shortName: 'Index', abbreviation: 'I' @@ -29,6 +31,10 @@ const headsetAliases = { } } +let cache = { + searchData: [] +} + async function retrieveSteamAppTitle() { let retrievePageButton = document.getElementById('retrieve-steam-app-title'); let pageResultsDiv = document.getElementById('page-results'); @@ -51,24 +57,17 @@ async function retrieveSteamAppTitle() { try { let appData = await post('./api/app-scrape', content); - let discounted = appData.discounted; - let isVr = appData.headsets.length > 0; - - let link = document.createElement('a'); - if (isVr) { + let text = ""; + if (appData.headsets.length > 0) { let platforms = getPlatformText(appData.headsets); - if (discounted) { - link.innerText = `[${platforms}] ${appData.title} (${appData.price} / ${appData.percentOff} off)`; - } else { - link.innerText = `[${platforms}] ${appData.title} (${appData.price})`; - } - } else { - if (discounted) { - link.innerText = `${appData.title} (${appData.price} / ${appData.percentOff} off)`; - } else { - link.innerText = `${appData.title} (${appData.price})`; - } + text += `[${platforms}] ` } + text += `${appData.title} ` + let priceTag = appData.percentOff ? `(${appData.price} / ${appData.percentOff} off)` : `(${appData.price})`; + text += `${priceTag}` + + let link = document.createElement('a'); + link.innerText = text; link.href = appData.link; link.target = '_blank'; link.style.display = 'inline'; @@ -126,12 +125,20 @@ async function retrieveSteamSearchTable() { for (let [index, app] of searchData.entries()) { let itemNumber = index + 1; searchResultsDiv.innerHTML = `Retrieving result ${itemNumber} of ${searchData.length}...`; - app.headsets = []; if (app.type == "APP") { let content = { url: app.link }; - app.headsets = await post('./api/headset-scrape', content); + + let appData = await post('./api/search-app-scrape', content); + app.headsets = appData.headsets || []; + app.countdown = appData.countdown || { text: "", time: 0 }; + } else { + app.headsets = []; + app.countdown = { + text: "", + time: 0 + } } } @@ -143,8 +150,17 @@ async function retrieveSteamSearchTable() { textArea.readOnly = true; textArea.innerHTML = text; + let csv = json2csv.parse(cache.searchData); + + let downloadLink = document.createElement('a'); + downloadLink.href = 'data:text/csv;charset=utf-8,' + encodeURI(csv); + downloadLink.target = '_blank'; + downloadLink.innerHTML = 'Download Raw Data as CSV'; + downloadLink.download = `steam-data-${getFormattedTime()}.csv`; + searchResultsDiv.innerHTML = ""; searchResultsDiv.appendChild(textArea); + searchResultsDiv.appendChild(downloadLink); } catch (error) { console.error(error); searchResultsDiv.innerHTML = "No results."; @@ -153,6 +169,55 @@ async function retrieveSteamSearchTable() { retrieveSearchButton.disabled = false; } +function getFormattedTime() { + let today = new Date(); + let y = today.getFullYear(); + // JavaScript months are 0-based. + let m = today.getMonth() + 1; + let d = today.getDate(); + let h = today.getHours(); + let mi = today.getMinutes(); + let s = today.getSeconds(); + return y + "-" + m + "-" + d + "-" + h + "-" + mi + "-" + s; +} + +function formatAppData(app) { + let formattedData = { + type: "", + platform: "", + platformAbbreviated: "", + title: "", + titleLink: "", + link: "", + price: "", + originalPrice: "", + percentOff: "", + countdownText: "", + countdownTime: 0, + reviews: "", + reviewsCount: "" + } + + formattedData.type = app.type; + formattedData.platform = app.headsets.join(', '); + formattedData.platformAbbreviated = app.headsets.map(platform => getHeadsetAbbreviation(platform)).join('/'); + formattedData.title = app.title; + + let titlePrefix = app.type == "BUNDLE" ? BUNDLE_PREFIX : ""; + formattedData.titleLink = `${titlePrefix}[${escapePipes(app.title)}](${app.link})`; + + formattedData.link = app.link; + formattedData.price = extractNumberFromPrice(app.price) || app.price; + formattedData.originalPrice = extractNumberFromPrice(app.originalPrice) || app.price; + formattedData.percentOff = extractNumberFromPercent(app.percentOff) || app.percentOff; + formattedData.countdownText = app.countdown.text; + formattedData.countdownTime = app.countdown.time; + formattedData.reviews = extractNumberFromPercent(app.reviewsPercent) || app.reviewsPercent; + formattedData.reviewsCount = app.reviewsCount; + + return formattedData; +} + async function retrieveSearchPageData(steamSearchUrl, pageNumber) { let content = { url: `${steamSearchUrl}` @@ -168,20 +233,16 @@ function createMarkdownTable(searchData) { let divider = '| :- | :- | -: | -: | -: | -: |'; let result = header + NEW_LINE + divider + NEW_LINE; - for (let app of searchData) { - let platform = app.headsets.map(platform => getHeadsetAbbreviation(platform)).join('/'); - let title = escapePipes(app.title); - let link = app.link; - let price = extractNumberFromPrice(app.price) || app.price || ""; - let percentOff = extractNumberFromPercent(app.percentOff) || app.percentOff || ""; - let reviews = extractNumberFromPercent(app.reviewsPercent) || app.reviewsPercent || ""; - let reviewsCount = app.reviewsCount || ""; + let formattedData = []; - let bundlePrefix = app.type == "BUNDLE" ? "**Bundle** - " : ""; - - result += `| ${platform} | ${bundlePrefix}[${title}](${link}) | ${price} | ${percentOff} | ${reviews} | ${reviewsCount} |` + NEW_LINE; + for (let app of searchData) { + let formatted = formatAppData(app); + result += `| ${formatted.platformAbbreviated} | ${formatted.titleLink} | ${formatted.price} | ${formatted.percentOff} | ${formatted.reviews} | ${formatted.reviewsCount} |` + NEW_LINE; + formattedData.push(formatted); } + cache.searchData = formattedData; + return result; } @@ -225,7 +286,7 @@ function getPlatformText(platforms) { } function getHeadsetshortName(headsetName) { - let headsetAlias = headsetAliases[headsetName]; + let headsetAlias = HEADSET_ALIASES[headsetName]; if (headsetAlias) { return headsetAlias.shortName; } else { @@ -234,7 +295,7 @@ function getHeadsetshortName(headsetName) { } function getHeadsetAbbreviation(headsetName) { - let headsetAlias = headsetAliases[headsetName]; + let headsetAlias = HEADSET_ALIASES[headsetName]; if (headsetAlias) { return headsetAlias.abbreviation; } else { diff --git a/services/steam-scraper.js b/services/steam-scraper.js index 313fe72..838bdfd 100644 --- a/services/steam-scraper.js +++ b/services/steam-scraper.js @@ -1,5 +1,7 @@ const _cheerio = require('cheerio'); const _rp = require('request-promise'); +const _regexUtils = require('../utils/regex-utils'); +const _stringUtils = require('../utils/string-utils'); const TITLE_REMOVE = [ 'Buy', @@ -9,172 +11,224 @@ const TITLE_REMOVE = [ 'Pre-Purchase' ]; -const PERCENT_REGEX = /(\d+%)/; -const REVIEWS_COUNT_REGEX = /([\d,]+) user review/; +async function getSearchPageData(searchUrl) { + let searchPageHtml = await _rp({ url: searchUrl }); + let $ = _cheerio.load(searchPageHtml); -async function getAppPageData(appUrl) { + let searchResults = Array.from($('#search_resultsRows > a.search_result_row')); + + let searchPageData = []; + for (let searchResult of searchResults) { + let gameData = await getGameDataFromSearchResult(searchResult); + searchPageData.push(gameData); + } + return searchPageData; +} + + +async function getSearchAppPageData(appUrl) { let appPageHtml = await _rp({ url: appUrl }); let $ = _cheerio.load(appPageHtml); - let gameElements = Array.from($('#game_area_purchase .game_area_purchase_game')); - if (gameElements.length < 1) { + let firstGame = getMainGameElement($); + if (!firstGame) { return { error: true, message: "Could not find any game elements." }; } - let firstGame = gameElements[0]; - let gameData = getGameDataFromGameElement(firstGame); + let countdown = getCountdownFromGameElement(firstGame); let headsets = getHeadsets($); return { - link: appUrl, - ...gameData, + countdown, headsets }; } -async function getSearchPageData(searchUrl) { - let searchPageHtml = await _rp({ url: searchUrl }); - let $ = _cheerio.load(searchPageHtml); - - let searchResults = Array.from($('#search_resultsRows > a.search_result_row')); +async function getAppPageData(appUrl) { + let appPageHtml = await _rp({ url: appUrl }); + let $ = _cheerio.load(appPageHtml); - let searchPageData = []; - for (var searchResult of searchResults) { - let gameData = await getGameDataFromSearchResult(searchResult); - searchPageData.push(gameData); + let firstGame = getMainGameElement($); + if (!firstGame) { + return { + error: true, + message: "Could not find any game elements." + }; } - return searchPageData; -} -function extractPercent(input) { - let match = PERCENT_REGEX.exec(input); - if (match) { - return match[1]; - } -} + let gameData = getGameDataFromGameElement(firstGame); + let countdown = getCountdownFromGameElement(firstGame); + let headsets = getHeadsets($); -function extractReviewsCount(input) { - let match = REVIEWS_COUNT_REGEX.exec(input); - if (match) { - return match[1]; - } + return { + link: appUrl, + ...gameData, + countdown, + headsets + }; } -function stripQueryString(url) { - return url.split(/[?#]/)[0]; +function getMainGameElement($) { + let gameElements = Array.from($('#game_area_purchase .game_area_purchase_game:not(.demo_above_purchase)')); + if (gameElements.length < 1) { + return; + } + return gameElements[0]; } function getHeadsets($) { let headsetTitleElement = $('.details_block.vrsupport > div:contains("Headsets")').parent(); let headsetElements = Array.from(headsetTitleElement.nextUntil('.details_block')); + let headsets = []; - for (var headsetElement of headsetElements) { + + for (let headsetElement of headsetElements) { let headsetName = $('.name', headsetElement).text().trim(); if (headsetName) { headsets.push(headsetName); } } - return headsets; -} -async function getHeadsetsFromAppPage(link) { - let pageHtml = await _rp({ url: link }); - let $ = _cheerio.load(pageHtml); - return getHeadsets($); + return headsets; } async function getGameDataFromSearchResult(searchResult) { let $ = _cheerio.load(searchResult); - let title = ""; - let link = ""; - let type = "UNKNOWN"; - let price = ""; - let discounted = false; - let originalPrice = ""; - let percentOff = ""; - let reviewsPercent = ""; - let reviewsCount = ""; + let gameData = { + link: "", + title: "", + type: "UNKNOWN", + price: "", + originalPrice: "", + percentOff: "", + reviewsPercent: "", + reviewsCount: "" + } + + let title = $('div.search_name > span.title').text().trim(); + if (title) { + gameData.title = title; + } + + let link = _stringUtils.stripQueryString(searchResult.attribs.href); + if (link) { + gameData.link = link; + } - title = $('div.search_name > span.title').text().trim(); - link = stripQueryString(searchResult.attribs.href); + if (gameData.link.includes('/app/')) { + gameData.type = "APP"; + } else if (gameData.link.includes('/bundle/')) { + gameData.type = "BUNDLE"; + } - if (link.includes('/app/')) { - type = "APP"; - } else if (link.includes('/bundle/')) { - type = "BUNDLE"; + let price = $('div.search_price').clone().children().remove().end().text().trim(); + if (price) { + gameData.price = price; } - price = $('div.search_price').clone().children().remove().end().text().trim(); - originalPrice = $('div.search_price > span > strike').text().trim(); - percentOff = extractPercent($('div.search_discount > span').text().trim()); + let originalPrice = $('div.search_price > span > strike').text().trim(); + if (originalPrice) { + gameData.originalPrice = originalPrice; + } - if (originalPrice && percentOff) { - discounted = true; + let percentOff = _regexUtils.extractPercent($('div.search_discount > span').text().trim()); + if (percentOff) { + gameData.percentOff = percentOff; } - if (type == "APP") { + if (gameData.type == "APP") { let reviewsSummary = $('div.search_reviewscore > span.search_review_summary').attr('data-tooltip-html'); if (reviewsSummary) { reviewsSummary = reviewsSummary.trim(); - reviewsPercent = extractPercent(reviewsSummary); - reviewsCount = extractReviewsCount(reviewsSummary).replace(/,/g, ''); + let reviewsPercent = _regexUtils.extractPercent(reviewsSummary); + if (reviewsPercent) { + gameData.reviewsPercent = reviewsPercent; + } + + let reviewsCount = _regexUtils.extractReviewsCount(reviewsSummary); + if (reviewsCount) { + gameData.reviewsCount = reviewsCount; + } } } - return { - title, - link, - type, - originalPrice, - discounted, - price, - percentOff, - reviewsPercent, - reviewsCount, - }; + return gameData; } function getGameDataFromGameElement(gameElement) { let $ = _cheerio.load(gameElement); - let title = ""; - let price = ""; - let discounted = false; - let originalPrice = ""; - let percentOff = ""; + let gameData = { + title: "", + price: "", + originalPrice: "", + percentOff: "" + } - title = $('.game_area_purchase_game > h1').children().remove().end().text().trim(); - for (var removeKeyword of TITLE_REMOVE) { + let title = $('.game_area_purchase_game > h1').children().remove().end().text().trim(); + for (let removeKeyword of TITLE_REMOVE) { if (title.startsWith(removeKeyword)) { title = title.substr(removeKeyword.length).trim(); } } - originalPrice = $('.discount_original_price').text().trim(); - percentOff = extractPercent($('.discount_pct').text().trim()); + if (title) { + gameData.title = title; + } - if (originalPrice && percentOff) { - discounted = true; + let originalPrice = $('.discount_original_price').text().trim(); + if (originalPrice) { + gameData.originalPrice = originalPrice; } - price = discounted ? $('.discount_final_price').text().trim() : $('.game_purchase_price').text().trim();; + let percentOff = _regexUtils.extractPercent($('.discount_pct').text().trim()); + if (percentOff) { + gameData.percentOff = percentOff; + } - return { - title, - originalPrice, - discounted, - price, - percentOff - }; + let price = gameData.originalPrice ? $('.discount_final_price').text().trim() : $('.game_purchase_price').text().trim(); + if (price) { + gameData.price = price; + } + + return gameData; +} + +function getCountdownFromGameElement(gameElement) { + let $ = _cheerio.load(gameElement); + + let countdownData = { + text: "", + time: 0 + } + + try { + let text = $('.game_purchase_discount_countdown').text().trim(); + if (text) { + countdownData.text = text; + } + } catch { }; + + + try { + let countdownScript = $('.game_area_purchase_game > script')[0].children[0].data; + let countdownTimeText = _regexUtils.extractDiscountCountdown(countdownScript); + let time = parseInt(countdownTimeText); + if (time) { + countdownData.time = time; + } + } catch { }; + + return countdownData; } module.exports = { getAppPageData, - getHeadsetsFromAppPage, + getSearchAppPageData, getSearchPageData }; \ No newline at end of file diff --git a/utils/regex-utils.js b/utils/regex-utils.js new file mode 100644 index 0000000..8b221ca --- /dev/null +++ b/utils/regex-utils.js @@ -0,0 +1,30 @@ +const PERCENT_REGEX = /(\d+%)/; +const REVIEWS_COUNT_REGEX = /([\d,]+) user review/; +const DISCOUNT_COUNTDOWN_REGEX = /DiscountCountdown,[ ]*([\d]{7,})/; + +function extractPercent(input) { + let match = PERCENT_REGEX.exec(input); + if (match) { + return match[1]; + } +} + +function extractReviewsCount(input) { + let match = REVIEWS_COUNT_REGEX.exec(input); + if (match) { + return match[1].replace(/,/g, ''); + } +} + +function extractDiscountCountdown(input) { + let match = DISCOUNT_COUNTDOWN_REGEX.exec(input); + if (match) { + return match[1]; + } +} + +module.exports = { + extractPercent, + extractReviewsCount, + extractDiscountCountdown +} \ No newline at end of file diff --git a/utils/string-utils.js b/utils/string-utils.js new file mode 100644 index 0000000..75c7823 --- /dev/null +++ b/utils/string-utils.js @@ -0,0 +1,7 @@ +function stripQueryString(url) { + return url.split(/[?#]/)[0]; +} + +module.exports = { + stripQueryString +} \ No newline at end of file