-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy patharchive_page.js
258 lines (236 loc) · 8.91 KB
/
archive_page.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
import puppeteer from 'puppeteer';
import fse from 'fs-extra'; // v 5.0.0
import path from 'path';
import fetch from 'node-fetch';
import crypto from 'crypto';
// Module-level SHA-256 hasher; the functions visible in this file create their own hashers per call.
const hash = crypto.createHash('sha256');
// Number of URL characters shown by the (currently commented-out) debug console output.
const CAP_DEBUG_ULR_LENGTH = 40;
// Absolute path of the file that collects URLs whose archiving failed.
const FAIL_URL = path.resolve('fail-log.csv');
/**
 * Append one row describing a captured response body to `url_hash_log.csv`
 * inside SAVED_PATH. The row holds the response URL, the SHA-256 of the body,
 * the `content-type` header, the body length and whether the body was obtained
 * through the fetch fallback. The body itself is not written by this function.
 *
 * Failures while reading the header or appending the row are logged and
 * swallowed so that one bad response never aborts the capture.
 *
 * @param {string} SAVED_PATH - directory that holds the logs for the archived URL
 * @param {object} response - response object exposing `headers()` and `url()`
 * @param {Buffer|string} recvData - body received for the response
 * @param {boolean} isFetch - true when recvData came from the fetch fallback
 * @returns {Promise<void>}
 */
async function saveResponsedFile(SAVED_PATH, response, recvData, isFetch) {
    // Double embedded quotes so a value can never terminate its CSV field early.
    const csvField = (value) => '"' + String(value).replace(/"/g, '""') + '"';
    const hash = crypto.createHash('sha256').update(recvData).digest('hex');

    let type = '';
    try {
        const header = response.headers()['content-type'];
        // A missing header stays an empty field instead of the text "undefined".
        if (header !== undefined) {
            type = header;
        }
    } catch (e) {
        console.log('Get response file type error, e:', e);
    }

    try {
        const row = [response.url(), hash, type, recvData.length, isFetch].map(csvField).join(',');
        await fse.appendFile(path.resolve(SAVED_PATH, 'url_hash_log.csv'), '\n' + row);
    } catch (e) {
        console.log('File save error, get e:', e);
        console.log(' |Saved ', path.resolve(SAVED_PATH, 'saved_responsed_file', hash));
        console.log(' |URL ', response.url());
    }
}
/**
 * Write the rendered state of `page` to disk: title plus final URL, the
 * serialized DOM (`index_loaded.html`) and the visible text followed by the title.
 *
 * @param {object} page - Puppeteer page to read from
 * @param {string} savedPath - directory receiving `index_loaded.html`
 * @param {string} titlePath - file receiving "<title>\n<url>"
 * @param {string} textOnlyPath - file receiving the visible text and the title
 * @returns {Promise<void>} rejects when the page can no longer be read or a write fails
 */
async function saveLoadedSnapshot(page, savedPath, titlePath, textOnlyPath) {
    const title = await page.title();
    await fse.outputFile(titlePath, title + '\n' + page.url());
    await fse.outputFile(path.resolve(savedPath, 'index_loaded.html'), await page.content());
    const visibleText = await page.$eval('html', e => e.innerText);
    if (visibleText.length !== 0) {
        await fse.appendFile(textOnlyPath, visibleText);
    }
    if (title.length !== 0) {
        await fse.appendFile(textOnlyPath, title);
    }
}

/**
 * Archive one web page with a headless browser.
 *
 * Every request/response is logged to `archived_log.csv`, every response body
 * is recorded through saveResponsedFile, the main document is stored as
 * `index.html`, and after the page has loaded (plus a 15 s grace period, all
 * bounded by a 30 s budget per browser) the rendered page is snapshotted.
 * The returned promise settles only after the snapshot is written and the
 * browser is closed.
 *
 * @param {string} archiveUrlString - URL of the page; must include http:// or https://
 * @returns {Promise<number>} 0 on success, 1 invalid URL, 2 output directory
 *   could not be created, 3 log files or page setup failed, 4 navigation failed
 *   or did not finish within the time budget, 5 the rendered snapshot failed
 */
async function archiveFunc(archiveUrlString) {
    const START_TIME = Date.now();
    const BROWSER_BUDGET_MS = 5000 * 6; // hard cap per browser (30 s)
    const SETTLE_DELAY_MS = 5000 * 3; // grace period after load before the snapshot (15 s)
    const DEADLINE = Symbol('deadline');

    // Validate the input before touching the file system.
    let archiveUrl;
    try {
        archiveUrl = new URL(archiveUrlString);
    } catch (e) {
        console.log('URL Error, get ', e);
        return 1;
    }

    // Each URL gets its own directory; every one of : / \ * ? " < > | becomes "_".
    const temp_name = archiveUrlString.replace(/[:\/\\*?"<>|]/g, '_');
    const SAVED_PATH = path.resolve('saved', temp_name);
    console.log(SAVED_PATH);
    try {
        await fse.ensureDir(SAVED_PATH);
    } catch (e) {
        console.log('Create path error, get ', e, '\nPATH:', SAVED_PATH);
        return 2;
    }

    const STD_REQUEST_LOG_PATH = path.resolve(SAVED_PATH, 'archived_log.csv');
    const STD_URL_HASH_PATH = path.resolve(SAVED_PATH, 'url_hash_log.csv');
    const STD_TITLE_FEATURE_PATH = path.resolve(SAVED_PATH, 'title_log.csv');
    const STD_TEXT_ONLY_PATH = path.resolve(SAVED_PATH, 'text_only_index_loaded.html.txt');
    try {
        await fse.writeFile(STD_REQUEST_LOG_PATH, 'timestamp,type,url,status');
        await fse.writeFile(STD_URL_HASH_PATH, 'url,hash,content-type,length,isFetch');
        await fse.remove(STD_TEXT_ONLY_PATH);
    } catch (e) {
        console.log('Initilize log file problem, Error:', e);
        return 3;
    }

    const browser = await puppeteer.launch();
    // const browser = await puppeteer.launch({headless: false}); // DEBUG
    let watchdogTimer;
    let settleTimer;
    // Resolves with DEADLINE once the per-browser time budget is used up.
    const deadline = new Promise((resolve) => {
        watchdogTimer = setTimeout(resolve, BROWSER_BUDGET_MS, DEADLINE);
    });
    let archiveUrlStatus;

    try {
        const page = await browser.newPage();
        try {
            await page.setRequestInterception(true);
            // Navigation itself has no timeout; the deadline above bounds it instead.
            await page.setDefaultNavigationTimeout(0);
        } catch (e) {
            console.log('setRequestInterception error, get e:', e);
            return 3;
        }

        // Log writes are best-effort: a failed append is reported, never thrown.
        const appendRequestLog = (type, urlString, status) =>
            fse.appendFile(STD_REQUEST_LOG_PATH, '\n"' + (Date.now() - START_TIME).toString() + '","' + type + '","' + urlString + '","' + status + '"')
                .catch((e) => console.log('Request log write error, e:', e.message));

        // Returns true when the rendered page was written completely.
        const takeSnapshot = async () => {
            try {
                console.log('Archive url status:', archiveUrlStatus);
                await saveLoadedSnapshot(page, SAVED_PATH, STD_TITLE_FEATURE_PATH, STD_TEXT_ONLY_PATH);
                return true;
            } catch (e) {
                console.log('Content error, get e:', e);
                return false;
            }
        };

        page.on('dialog', async (dialog) => {
            if (dialog.message().length !== 0) {
                fse.appendFile(STD_TEXT_ONLY_PATH, dialog.message())
                    .catch((e) => console.log('Dialog text write error, e:', e.message));
            }
            try {
                await dialog.dismiss();
            } catch (e) {
                console.log('Dialog dismiss error, e:', e.message);
            }
        });
        page.on('request', (request) => {
            appendRequestLog('Request', request.url(), 'Sent');
            request.continue().catch((e) => console.log('Request continue error, e:', e.message));
        });
        // Registered once: a second identical listener would log every failure twice.
        page.on('requestfailed', (request) => {
            appendRequestLog('Request', request.url(), 'Failed');
        });
        page.on('response', async (response) => {
            try {
                const urlString = response.request().url();
                const status = response.status();
                // Body as delivered to the browser; stays null when Puppeteer cannot provide it.
                let body = null;
                let bodyError = null;
                try {
                    body = await response.buffer();
                } catch (e) {
                    bodyError = e;
                    if (e.message === 'Response body is unavailable for redirect responses') {
                        console.log("Error:", e.message);
                    } else {
                        console.log('Puppeteer fail to get response.buffer() from url:', urlString, '\n |try fetch the resource\n |Error:', e.message);
                        try {
                            const res = await fetch(urlString);
                            // Keep raw bytes so the recorded hash matches what the server sent.
                            const fetched = Buffer.from(await res.arrayBuffer());
                            await saveResponsedFile(SAVED_PATH, response, fetched, true);
                        } catch (fetchError) {
                            console.log('Fetch error, skiping url:', urlString, '\n |Error:', fetchError.message);
                        }
                    }
                }
                if (body !== null && body.length > 0) {
                    await saveResponsedFile(SAVED_PATH, response, body, false);
                }
                await appendRequestLog('Response', urlString, status);

                const href = new URL(urlString).href;
                if (href === archiveUrlString || href === page.url()) {
                    // This response is the main document of the archived page.
                    archiveUrlStatus = status;
                    try {
                        if (bodyError !== null) {
                            throw bodyError;
                        }
                        if (body.length !== 0) {
                            const digest = crypto.createHash('sha256').update(body).digest('hex');
                            await fse.outputFile(path.resolve(SAVED_PATH, 'index.html'), body);
                            await fse.outputFile(path.resolve(SAVED_PATH, 'saved_responsed_file', digest), body);
                            console.log('Saved archiveUrl main index file:', path.resolve(SAVED_PATH, 'index.html'));
                        }
                    } catch (e) {
                        console.log('Try save main page response error \n |Error:', e.message);
                    }
                }
            } catch (e) {
                console.log('Response handling error, get e:', e.message);
            }
        });

        // Settle to a value either way so a late rejection (browser closed) is never unhandled.
        const navigation = page.goto(archiveUrl.href, {
            waitUntil: 'networkidle2'
        }).then(() => ({ error: null }), (error) => ({ error }));
        const navigationResult = await Promise.race([navigation, deadline]);
        if (navigationResult === DEADLINE) {
            // Out of time while still loading: keep whatever rendered, report the navigation as failed.
            await takeSnapshot();
            console.log('Page goto error, get e:', new Error('Navigation did not finish within ' + BROWSER_BUDGET_MS + ' ms'));
            return 4;
        }
        if (navigationResult.error !== null) {
            console.log('Page goto error, get e:', navigationResult.error);
            return 4;
        }

        // Let late scripts run for a while, but never past the overall budget.
        const settled = new Promise((resolve) => {
            settleTimer = setTimeout(resolve, SETTLE_DELAY_MS);
        });
        await Promise.race([settled, deadline]);
        return (await takeSnapshot()) ? 0 : 5;
    } finally {
        // Pending timers would keep the process alive and fire on a closed browser.
        clearTimeout(watchdogTimer);
        clearTimeout(settleTimer);
        try {
            await browser.close();
        } catch (e) {
            console.log('Browser close error, get e:', e.message);
        }
    }
}
/**
 * Archive every URL listed in `200-safe-vn.csv` (one URL per line), one after
 * another. A URL whose archiveFunc call returns a positive code or throws is
 * appended, on its own line, to the file at FAIL_URL.
 *
 * @returns {Promise<void>} rejects only when the URL list cannot be read
 */
async function main() {
    const listing = await fse.readFile('200-safe-vn.csv', 'utf-8');
    // Accept both CRLF and LF line endings and ignore blank lines.
    const all_urls = listing
        .split(/\r?\n/)
        .map((line) => line.trim())
        .filter((line) => line.length > 0);

    for (const current of all_urls) {
        console.log('============Starting==============\n\t', current, '\n==================================');
        let failed;
        try {
            failed = (await archiveFunc(current)) > 0;
        } catch (e) {
            console.log('Archive url:', current, ' process gone wroong\nError:', e);
            failed = true;
        }
        if (failed) {
            try {
                // Always newline-terminated so the fail log keeps one URL per line.
                await fse.appendFile(FAIL_URL, current + '\n');
            } catch (e) {
                console.log('Fail log write error, e:', e.message);
            }
        }
    }
}
// Disabled manual invocation for a single URL, kept for debugging:
// string = 'http://zonehmirrors.org/defaced/2013/10/20/mysafetyshop.net/root.html';
// console.log(archiveFunc(string));
// Entry point: start the batch run when the module is executed.
main();