diff --git a/E-Tiller.js b/E-Tiller.js new file mode 100644 index 00000000..4593997a --- /dev/null +++ b/E-Tiller.js @@ -0,0 +1,356 @@ +/* + ***** BEGIN LICENSE BLOCK ***** + + Copyright © 2022 jiaojiaodubai23@gmail.com + + This file is part of Zotero. + + Zotero is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Zotero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with Zotero. If not, see . + + ***** END LICENSE BLOCK ***** +*/ + +const WEBTYPE = [ + { + name: 'table_richsapn', + pagekey: 'table.front_table #Author', + issingle: true + }, + // 缺失关键信息,无法解析 + // { + // name: 'table_lessspan', + // pagekey: '#QueryUI td > table td > table:nth-child(1) td > table tr:nth-child(4) > td > table #FileTitle', + // issingle: true + // }, + { + name: 'exportable', + pagekey: '#ExportUrl', + issingle: true + }, + { + name: 'view_abstract', + pagekey: 'a[href*="reader/view_abstract.aspx?file_no="]', + issingle: false + }, + { + name: 'artical_abstract', + pagekey: 'a[href*="/article/abstract/"]', + issingle: false + } +]; + +function detectWeb(doc, _url) { + var atts = [ + 'class', + 'id' + ]; + let insite = atts.some((element) => { + let foots = Array.from(doc.querySelectorAll(`div[${element}*="foot"]`)); + return foots.some(foot => (foot.innerText.match(/(技术支持)?.*北京勤云科技发展有限公司/))); + }); + if (!insite) return false; + let validtype = WEBTYPE.find(element => ( + doc.querySelector(element.pagekey) + )); + // Z.debug(`detect type as\n${JSON.stringify(validtype)}`); + if (!validtype) return false; + if (validtype.issingle) { + return 'journalArticle'; + } + else if (getSearchResults(doc, validtype, true)) { + return 'multiple'; + } + return false; +} + +function getSearchResults(doc, type, checkOnly) { + var items = {}; + var found = false; + let tempRows = Array.from(doc.querySelectorAll(type.pagekey)); + var rows = tempRows.filter(element => (!(element.textContent.startsWith('摘要')))); + for (let row of rows) { + let href = row.href; + let title = ZU.trimInternal(row.textContent); + if (!href || !title) continue; + if (checkOnly) return true; + found = true; + items[href] = title; + // Z.debug(`add\n${href}\n${title}\nas item`); + } + return found ? items : false; +} + +async function doWeb(doc, url) { + let validtype = WEBTYPE.find(element => ( + doc.querySelector(element.pagekey) + )); + if (detectWeb(doc, url) == 'multiple') { + let items = await Zotero.selectItems(getSearchResults(doc, validtype, false)); + if (!items) return; + for (let url of Object.keys(items)) { + await doWeb(await requestDocument(url), url); + } + } + else { + switch (validtype.name) { + case 'table_richsapn': + await scrapeElement(doc, url); + break; + case 'exportable': + await scrapeRis(doc, url); + break; + default: + break; + } + } +} + +const TABLEIDMAP = { + title: { path: '#FileTitle' }, + titleTranslation: { path: '#EnTitle' }, + abstractNote: { path: '#Abstract' }, + abstractTranslation: { path: '#EnAbstract' }, + publicationTitle: { + path: '#ReferenceText', + callback: function (text) { + text = text.split('.').split(',')[0]; + return text; + } + }, + volume: { + path: '#ReferenceText', + callback: function (text) { + text = text.split('.') + .pop().split(',') + .pop(); + return text.match(/\d+(?=())/)[0]; + } + }, + issue: { + path: '#ReferenceText', + callback: function (text) { + text = text.split('.').pop().split(',') + .pop(); + return text.match(/\(\d+\)/)[0].slice(1, -1); + } + }, + pages: { + path: '#ReferenceText', + callback: function (text) { + return text.split(':').pop(); + } + }, + date: { + path: '#ReferenceText', + callback: function (text) { + return text.split('.').pop().split(',')[0]; + } + }, + DOI: { path: '#DOI' }, + creators: { + path: '#Author > table tr td:first-of-type', + multiple: true, + callback: function (arr) { + if (arr[0] == '作者') arr.shift(); + arr = arr.map(element => (element.replace(/\d/g, ''))); + arr = cleanArr(arr); + arr = arr.filter(element => (element.length > 1)); + return arr.map(element => (matchCreator(element))); + } + }, + tags: { + path: '#KeyWord a', + multiple: true, + callback: function (arr) { + arr = cleanArr(arr); + return arr.map(keyword => ( + { tag: keyword } + )); + } + } +}; + +function cleanArr(arr) { + let seprator = /[,,;;]\s?/; + if (arr.length == 1 && arr[0].match(seprator)) { + arr = arr[0].split(seprator); + } + return arr; +} + +function matchCreator(creator) { + // Z.debug(creators); + if (/[A-Za-z]/.test(creator)) { + creator = ZU.cleanAuthor(creator, "author"); + } + else { + creator = creator.replace(/\s/g, ''); + creator = { + lastName: creator, + creatorType: "author", + fieldMode: 1 + }; + } + return creator; +} + +async function scrapeElement(doc, url = doc.location.href) { + var newItem = new Z.Item('journalArticle'); + for (const field in TABLEIDMAP) { + const recipe = TABLEIDMAP[field]; + // Z.debug(`for ${field}, use selector path ${recipe.path}`); + var result; + try { + if (recipe.multiple) { + result = Array.from(doc.querySelectorAll(recipe.path)).map( + element => (element.innerText) + ); + } + else { + result = doc.querySelector(recipe.path); + result = result.innerText; + } + if (recipe.callback) { + result = recipe.callback(result); + } + // Z.debug(`return result:`); + // Z.debug(result); + } + catch (error) { + result = ""; + } + newItem[field] = result; + } + // 有些网页加载异常就会获取不到标题,只能用英文标题暂替 + if (newItem.title == '') newItem.title = newItem.titleTranslation; + var pdfURL = doc.querySelector('#URL').href; + newItem.attachments.push({ + url: pdfURL, + title: 'Full Text PDF', + mimeType: 'application/pdf' + }); + newItem.attachments.push({ + url: url, + title: 'Snapshot', + mimeType: 'text/html' + }); + newItem.complete(); +} + +const METAMAP = { + volume: { + tag: 'name', + value: 'citation_volume' + }, + issue: { + tag: 'name', + value: 'citation_issue' + }, + date: { + tag: 'name', + value: 'citation_date' + }, + journalAbbreviation: { + tag: 'name', + value: 'citation_journal_abbrev' + }, + language: { + tag: 'http-equiv', + value: 'Content-Language' + }, + ISSN: { + tag: 'name', + value: 'citation_issn' + }, + firstpage: { + tag: 'name', + value: 'citation_firstpage' + }, + lastpage: { + tag: 'name', + value: 'citation_lastpage' + } +}; + +async function scrapeRis(doc, url = doc.location.href) { + var risURL = doc.querySelector('#ExportUrl').href.split('/'); + let id = risURL.pop(); + risURL = risURL.join('/'); + // Z.debug(risURL); + let pdfURL = doc.querySelector('#PdfUrl').href; + // Z.debug(pdfURL); + let risText = await requestText( + risURL, + { + method: 'POST', + body: `export_type=ris&include_content=2&article_list=${id}&action_type=export` + }); + let translator = Zotero.loadTranslator('import'); + translator.setTranslator('32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7'); // RIS + translator.setString(risText); + translator.setHandler('itemDone', (_obj, item) => { + for (const field in METAMAP) { + const recipe = METAMAP[field]; + let result; + try { + result = doc.querySelector(`head > meta[${recipe.tag}="${recipe.value}"]`).content; + } + catch (error) { + result = ''; + } + // Z.debug(`in field ${field},`) + // Z.debug(`we already have:\n${item[field]}`); + // Z.debug(`and now we get from meta:\n${result}`); + if (!item[field] && result) { + item[field] = result; + } + } + item.creators = item.creators.map((creator) => { + if (!/[A-Za-z]/.test((creator).lastName)) { + return matchCreator((creator).lastName); + } + return (creator); + }); + item.pages = (function () { + let firstpage = item.firstpage; + let lastpage = item.lastpage; + delete item.firstpage; + delete item.lastpage; + if (firstpage && lastpage) { + return `${firstpage}-${lastpage}`; + } + else if (firstpage || lastpage) { + return [firstpage, lastpage].find(page => (page)); + } + else { + return ''; + } + })(); + item.language = (['zh', 'zh-cn'].includes(item.language)) ? 'zh-CN' : 'en-US'; + item.attachments.push({ + url: pdfURL, + title: 'Full Text PDF', + mimeType: 'application/pdf' + }); + item.attachments.push({ + url: url, + title: 'Snapshot', + mimeType: 'text/html' + }); + item.complete(); + }); + await translator.translate(); +} + + diff --git a/README.md b/README.md index a280ba96..45427108 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,9 @@ - 摘要信息:作者简介 + 内容简介 - 目录信息:目录存放至note附件里 - [详细截图](https://github.com/Captain2021/myTranslator/tree/main) ++ 勤云科技 -> [E-Tiller](/E-Tiller.js) + - 支持最新、次新版的勤云科技期刊网站 + - 对照meta标签补全导出结果 + 仁和汇智 -> [RHHZ.js](./RHHZ.js) - [x] 抓取引文信息 - [x] 支持单条目 diff --git a/data/data.json b/data/data.json index 36dbef10..b689e280 100644 --- a/data/data.json +++ b/data/data.json @@ -26,6 +26,7 @@ "PKULaw": "北大法宝", "China Judgements Online": "中国裁判文书网", "PubScholar": "PubScholar公益学术平台", + "E-Tiller": "勤云科技" "RHHZ": "仁和汇智" "ChinaXiv": "中国科学院科技论文预发布平台", "National Science and Technology Library - China": "国家科技图书文献中心"