Skip to content

Commit

Permalink
save
Browse files Browse the repository at this point in the history
  • Loading branch information
lancejpollard committed Jan 18, 2024
1 parent 6a0ecda commit 1333cc3
Show file tree
Hide file tree
Showing 11 changed files with 167 additions and 50 deletions.
3 changes: 3 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -190,3 +190,6 @@ LABEL org.opencontainers.image.description "A wrapper around a lot of tools to m

RUN /home/python/venv/bin/pip install antlr4-tools
RUN ln -s /home/python/venv/bin/antlr4-parse /usr/bin/antlr4-parse

RUN /home/python/venv/bin/pip install patool
RUN ln -s /home/python/venv/bin/patool /usr/bin/patool
7 changes: 3 additions & 4 deletions code/cli/task.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import {
convertDocumentWithLibreOffice,
convertDocumentWithPandoc,
slicePdf,
useSplitPdf,
} from '../node/document.js'
import { convertFontWithFontForge } from '../node/font.js'
import { convertVideoWithFfmpeg } from '../node/video.js'
Expand Down Expand Up @@ -168,9 +167,9 @@ export async function call(task: Task, source) {
break
}
case 'slice': {
if (await useSplitPdf('slice', source)) {
return await slicePdf(source)
}
// if (await useSplitPdf('slice', source)) {
// return await slicePdf(source)
// }
break
}
case 'format': {
Expand Down
8 changes: 0 additions & 8 deletions code/node/archive.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,5 @@
import { handleCommand } from './command.js'

// defineCompress('rar', buildCommandToCreateRar, runRarCommand)

// const IORar = BuildBaseInputFileOutputDirectoryModel.superRefine(
// transform_input_file_output_directory_array(['7z']),
// )

// export const useDecompressWith7z = buildUseDecompress(['7z'], '7z')

// export async function decompressWith7z(source) {
// const input = IO7z.parse(source)
// const cmd = buildCommandToDecompressWith7z(input)
Expand Down
2 changes: 0 additions & 2 deletions code/node/base.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import { RefinementCtx, z } from 'zod'
import { replaceFileExtension } from '../shared/tool.js'
import path from 'path'
import { BuildBaseFileInputModel, Task } from '../shared/index.js'

export const transform_input_output_file =
(a: string, b: string) => (v: any, ctx: RefinementCtx) => {
Expand Down
92 changes: 57 additions & 35 deletions code/node/html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,49 +79,71 @@ export async function convertHTMLToPNG(source: ConvertHtmlToPdf) {

const wait = (ms: number) => new Promise(res => setTimeout(res, ms))

// inspectWebpage().then(data => console.log(data))
// inspectWebpage().then(data => console.log(data.fonts))

export async function inspectWebpage(source) {
const b = await getBrowser(undefined)
const p = await b.newPage()
await p.goto(
'https://github.com/mahirshah/css-property-parser/blob/master/src/expandShorthandProperty.js',
)
const scriptList = fs.readFileSync(
'host/code/node/puppeteer/theme.js',
'utf-8',
)
// await p.waitForNavigation({ waitUntil: 'domcontentloaded' })
await p.evaluate(scriptList)

await wait(1000)

let data = await p.evaluate('task.getWebpageData()')
await b.close()
let data
try {
const p = await b.newPage()
const requests: Array<any> = []
const hosts = {}
const session = await p.target().createCDPSession()
await session.send('DOM.enable')
await session.send('CSS.enable')
const fonts = {}
session.on('CSS.fontsUpdated', event => {
if (event.font) {
fonts[event.font.platformFontFamily] = event.font
}
// event will be received when browser updates fonts on the page due to webfont loading.
})
p.on('requestfinished', async request => {
const response = request.response()
const url = new URL(request.url())
const host = `${url.protocol}//${url.hostname}`
hosts[host] = true

const responseHeaders = response?.headers()

const information = {
url: request.url(),
method: request.method(),
requestHeaders: request.headers(),
requestPostData: request.postData(),
responseHeaders: responseHeaders,
}
requests.push(information)
})

await p.goto('https://www.npmjs.com/package/lite-meta-scraper')
const cookies = await p.cookies()
const scriptList = fs.readFileSync(
'host/code/node/puppeteer/theme.js',
'utf-8',
)
// await p.waitForNavigation({ waitUntil: 'load' })
await p.evaluate(scriptList)

await wait(1000)

data = (await p.evaluate('task.getWebpageData()')) as Record<
string,
any
>
data.requests = requests
data.hosts = Object.keys(hosts).sort()
data.cookies = cookies
data.fonts = Object.values(fonts)
} catch (e) {
} finally {
await b.close()
}
return data
}

// https://stackoverflow.com/questions/1403087/how-can-i-convert-an-html-table-to-csv
// https://stackoverflow.com/questions/37498713/how-to-export-an-html-table-as-a-xlsx-file
/**
 * Opens the page referenced by `source` in a browser obtained from
 * `getBrowser`, as the first step of scraping its HTML tables.
 *
 * The table extraction itself is not implemented yet: the function only
 * validates `source`, navigates to the page and then releases the browser.
 *
 * @param source - Options validated with `ConvertHtmlToPdfModel`; only
 *   `input`, `proxy` and `waitUntil` are read here.
 * @returns A promise that resolves once the page was loaded and the
 *   browser was closed.
 */
export async function scrapeHTMLTables(source: ConvertHtmlToPdf) {
  const { input, proxy, waitUntil } = ConvertHtmlToPdfModel.parse(source)

  const b = await getBrowser(proxy ? `${proxy}` : undefined)
  try {
    const p = await b.newPage()
    // Hand `waitUntil` to `goto` directly. Calling `waitForNavigation`
    // after `goto` has resolved waits for a *second* navigation that never
    // happens, so it would only end with a navigation timeout.
    await p.goto(`${input.file.path}`, { waitUntil })
    // TODO: extract the tables from the loaded page.
  } finally {
    // Always release the browser, even when navigation fails.
    await b.close()
  }
}

/**
 * Opens the page referenced by `source` in a browser obtained from
 * `getBrowser`, as the first step of scraping its meta tags.
 *
 * The meta tag extraction itself is not implemented yet: the function only
 * validates `source`, navigates to the page and then releases the browser.
 *
 * @param source - Options validated with `ConvertHtmlToPdfModel`; only
 *   `input`, `proxy` and `waitUntil` are read here.
 * @returns A promise that resolves once the page was loaded and the
 *   browser was closed.
 */
export async function scrapeHTMLMetaTags(source: ConvertHtmlToPdf) {
  const { input, proxy, waitUntil } = ConvertHtmlToPdfModel.parse(source)

  const b = await getBrowser(proxy ? `${proxy}` : undefined)
  try {
    const p = await b.newPage()
    // Hand `waitUntil` to `goto` directly. Calling `waitForNavigation`
    // after `goto` has resolved waits for a *second* navigation that never
    // happens, so it would only end with a navigation timeout.
    await p.goto(`${input.file.path}`, { waitUntil })
    // TODO: extract the meta tags from the loaded page.
  } finally {
    // Always release the browser, even when navigation fails.
    await b.close()
  }
}

// export async function convertHTMLToPDF(source: ConvertHtmlToPdf) {
// const {
Expand Down
24 changes: 23 additions & 1 deletion code/node/puppeteer/theme.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,29 @@ import { ColorTranslator } from 'colortranslator'

export function getWebpageData() {
const theme = getTheme()
return theme
const meta = getMeta()
return { theme, meta }
}

/**
 * Collects the `<meta>` elements found under the document `<head>`.
 *
 * An element contributes one entry keyed by `property` when that attribute
 * is non-empty, otherwise one keyed by `name` when that attribute is
 * non-empty; elements with neither are skipped. Attribute values are
 * trimmed, and `content` is `undefined` when the attribute is absent.
 *
 * @returns The collected entries in document order.
 */
function getMeta() {
  const entries: Array<any> = []

  Array.from(document.querySelectorAll('head meta')).forEach(node => {
    const read = (attribute: string) =>
      node.getAttribute(attribute)?.trim()

    // `property` takes precedence over `name`; the check is made on the
    // raw (untrimmed) attribute value.
    const key = ['property', 'name'].find(attribute =>
      node.getAttribute(attribute),
    )
    if (key === undefined) {
      return
    }

    entries.push({ [key]: read(key), content: read('content') })
  })

  return entries
}

function getTheme() {
Expand Down
2 changes: 2 additions & 0 deletions code/shared/command.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ export const COMMAND_NAME = [
'docx2pdf',
'unoconv',
'gifsicle',
'patool',
] as const

export type CommandName = (typeof COMMAND_NAME)[number]
Expand Down Expand Up @@ -76,6 +77,7 @@ export const COMMAND: Record<CommandName, Array<string> | undefined> = {
? ['docx2pdf']
: undefined,
unoconv: ['unoconv'],
patool: ['patool'],
}

export function getCommand(name: CommandName) {
Expand Down
15 changes: 15 additions & 0 deletions code/shared/http.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import contentTypeParse from 'content-type'
import acceptLanguageParser from 'accept-language-parser'

/**
 * Parses a raw HTTP `Content-Type` header value by delegating to the
 * `content-type` package.
 *
 * @param value - The raw header value.
 * @returns Whatever `contentTypeParse.parse` produces for `value`.
 */
export function parseHttpContentTypeHeader(value: string) {
  const parsed = contentTypeParse.parse(value)
  return parsed
}

/**
 * Parses a raw header value by delegating to `accept-language-parser`.
 *
 * Each parsed entry has a shape like:
 *
 *   {
 *     code: "en",
 *     region: "GB",
 *     quality: 1.0
 *   }
 *
 * NOTE(review): despite the name, this uses an Accept-Language parser, so
 * it presumably expects an `Accept-Language` value rather than a generic
 * `Accept` value - confirm against callers.
 *
 * @param value - The raw header value.
 * @returns Whatever `acceptLanguageParser.parse` produces for `value`.
 */
export function parseHttpAcceptHeader(value: string) {
  const parsed = acceptLanguageParser.parse(value)
  return parsed
}
39 changes: 39 additions & 0 deletions code/shared/type/source/archive.ts
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,42 @@ export const build_command_to_compress_with_zip: Form = {
},
},
}

// List of archive format names associated with the `patool` command.
// NOTE(review): presumably these are the formats patool can handle -
// verify against the patool documentation before relying on it.
export const patool_format: List = {
form: 'list',
list: [
'7z',
'ace',
'adf',
'alzip',
'ape',
'ar',
'arc',
'arj',
'bzip2',
'bzip3',
'cab',
'chm',
'compress',
'cpio',
'deb',
'dms',
'flac',
'gzip',
'iso',
'lrzip',
'lzh',
'lzip',
'lzma',
'lzop',
'rpm',
'rar',
'rzip',
'shn',
'tar',
'xz',
'zip',
'zoo',
'zstandard',
],
}
3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,20 @@
"@types/node-forge": "^1.3.11",
"@types/replace-ext": "^2.0.2",
"@types/sanitize-html": "^2.9.5",
"accept-language-parser": "^1.5.0",
"archiver": "^6.0.1",
"bytes": "^3.1.2",
"chalk": "^5.3.0",
"color-convert": "^2.0.1",
"colortranslator": "^4.1.0",
"concurrently": "^8.2.2",
"content-type": "^1.0.5",
"css-background-parser": "^0.1.0",
"css-font-parser": "^2.0.0",
"css-property-parser": "^1.0.6",
"csv-parse": "^5.5.3",
"dayjs": "^1.11.10",
"fast-content-type-parse": "^1.1.0",
"fast-glob": "^3.3.2",
"fflate": "^0.8.1",
"file-type": "^19.0.0",
Expand Down
22 changes: 22 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 1333cc3

Please sign in to comment.