-
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathupdate-imdb-interests.js
86 lines (69 loc) · 3.31 KB
/
update-imdb-interests.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
// https://www.imdb.com/interest/all/
const _ = require("lodash");
const logger = require("./src/helpers/logger");
const helpers = require("./src/helpers/helpers");
// Configure the project logger before any scraping starts.
// NOTE(review): level 0 is presumably the most verbose setting — confirm in ./src/helpers/logger.
logger.setLevel(0);
// NOTE(review): presumably makes helpers.requestAsync also dump each response to a file — confirm in ./src/helpers/helpers.
helpers.setRequestAsyncDumpToFile(true);
/**
 * Scrapes IMDB interest pages (https://www.imdb.com/interest/inNNNNNNN/) one after another,
 * extracts the interest metadata from each page's embedded __NEXT_DATA__ JSON and, once enough
 * pages were visited, writes the collected list to ./data/imdb-interests.json.
 *
 * Any exception (including an HTTP status above 399) aborts the run and is logged; nothing is
 * written to disk in that case.
 */
(async () => {
  // Interest IDs are "in" followed by a zero-padded 7-digit number, e.g. "in0000001".
  const INTEREST_ID_DIGITS = 7;
  // Highest numeric interest ID that is probed.
  const MAX_COUNTER = 9999;
  // Probing stops once more than this many pages yielded no usable interest data.
  const MAX_FAILED_PAGES = 2;
  // The JSON file is only written when the loop advanced past this counter.
  // NOTE(review): presumably a guard against overwriting the file with a truncated run — confirm.
  const MIN_COUNTER_TO_SAVE = 200;
  // Captures the JSON payload of the Next.js data script tag embedded in the page HTML.
  const NEXT_DATA_PATTERN = /<script id="__NEXT_DATA__" type="application\/json">([\s\S]*?)<\/script>/;

  // padStart keeps the numeric part at 7 digits for every counter up to MAX_COUNTER;
  // manual "< 100" / "< 10" padding would yield 8 digits from counter 1000 onwards.
  const buildInterestUrl = (counter) =>
    `https://www.imdb.com/interest/in${String(counter).padStart(INTEREST_ID_DIGITS, "0")}/`;

  const imdbInterestData = [];
  // TODO: load from JSON for prefilling
  try {
    let failedPages = 0;
    // Declared outside the loop because its final value decides whether the file gets written.
    let counter;
    for (counter = 1; counter <= MAX_COUNTER; counter++) {
      const url = buildInterestUrl(counter);
      const response = await helpers.requestAsync(url);
      if (response.statusCode > 399) {
        throw new Error(`ERROR: IMDB Main Page gave HTTP status code ${response.statusCode}. URL used: ${url}`);
      }

      const html = response.body;
      // Fall back to an empty object when the page carries no __NEXT_DATA__ script tag.
      const nextDataMatch = html.match(NEXT_DATA_PATTERN);
      const jsonDataNext = JSON.parse(nextDataMatch ? nextDataMatch[1] : "{}");

      // NOTE: IMDB JSON is always nested in props.pageProps
      const interestData = _.get(jsonDataNext, "props.pageProps.interestData");
      // _.get only applies its default for undefined, so "|| []" additionally covers a null edges value.
      const similarInterestEdges = _.get(interestData, "similarInterests.edges") || [];

      const imdbInterestDataItem = {
        IMDB_Interest_ID: _.get(interestData, "id"), // e.g. "in0000001"
        Name: _.get(interestData, "primaryText.id"), // e.g. "Action"
        Category: _.get(interestData, "secondaryText.id"), // e.g. "Genre"
        Description: _.get(interestData, "description.value.plainText"), // e.g. "The action genre features fast-paced..."
        Image_URL: _.get(interestData, "primaryImage.url"),
        Image_Caption: _.get(interestData, "primaryImage.caption.plainText"),
        Similar_Interests_IDs: similarInterestEdges.map((similarInterest) => _.get(similarInterest, "node.id")), // e.g. ["in0000002", "in0000003"]
      };

      // An entry is only usable when it has an ID, a name and a category.
      if (imdbInterestDataItem.IMDB_Interest_ID && imdbInterestDataItem.Name && imdbInterestDataItem.Category) {
        logger.log(
          "adding/updating",
          imdbInterestDataItem.Category,
          imdbInterestDataItem.Name,
          `(${imdbInterestDataItem.IMDB_Interest_ID})`
        );
        imdbInterestData.push(imdbInterestDataItem);
      } else {
        // Increment first so the logged number includes the current failure.
        failedPages += 1;
        logger.error("cannot add/update, counter:", counter, "failedPages:", failedPages);
      }

      if (failedPages > MAX_FAILED_PAGES) {
        break;
      }
    }

    if (counter > MIN_COUNTER_TO_SAVE) {
      logger.error("counter reached at least 200, saving json...");
      await helpers.writeFileAsync("./data/imdb-interests.json", JSON.stringify(imdbInterestData, null, 2));
    }
  } catch (error) {
    logger.error(`EXCEPTION: ${error}`);
  }
})();