From 5bbee424e4889d7fb8d6f3764ae8791e0fac5a26 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Tue, 19 May 2026 17:42:03 +0800 Subject: [PATCH 1/3] Refresh README + docs for default-on IEEE/Scholar via visible Chrome The env-var convention flipped from opt-IN (AUTOPAPERTOPPT_ENABLE_*) to opt-OUT (AUTOPAPERTOPPT_DISABLE_*) several commits ago, but the user- facing docs still showed the old names + the old behaviour. README.md: - Env-var table: replace ENABLE_IEEE_SCRAPING / ENABLE_SCHOLAR_SCRAPING with the current DISABLE_* opt-out variables; add CHROME_PROFILE_DIR + DISABLE_WEBRUNNER + CORE_API_KEY rows; expand CONTACT_EMAIL + S2_API_KEY notes to mention the OA resolver. - Features list: Scholar + IEEE are no longer "opt-in scrape"; they are default-on via visible Chrome (selenium). Add bullets for the LLM-as-agent flow scripts and the OA PDF resolver. - CLI flags table: drop --all-venues (never existed under that name); add --no-pdf, --no-oa-resolve, --top-tier-only. - New section "LLM-as-agent flow" pointing at scripts/llm_*.py and .claude/agents/paper-summary-author.md. docs/cli.md: - --source row: name DISABLE_* opt-out vars instead of ENABLE_*; note IEEE_API_KEY as the anonymous-safe shortcut. - --lightweight row: add the LLM-as-agent flow as the preferred path when an LLM is in the editor. - Examples block: drop the ENABLE_IEEE_SCRAPING=1 prefix; add a CHROME_PROFILE_DIR example so VPN / SSO cookies survive across runs. docs/en/index.rst: - Available source plugins table: rewrite ieee + scholar rows for default-on visible-Chrome behaviour with DISABLE_* opt-outs. - Mention CHROME_PROFILE_DIR for VPN persistence. - Replace --all-venues with --top-tier-only (opposite default). docs/zh-tw/index.rst + docs/zh-cn/index.rst: - Surgical rename ENABLE_* -> DISABLE_* on every occurrence. Prose paragraphs around them stay in their original Chinese. 
docs/mcp.md: - list_sources example JSON: reflect IEEE + Scholar being default-on with opt_out_env_var instead of needing an opt-in. --- README.md | 71 ++++++++++++++++++++++++++++++++++---------- docs/cli.md | 19 +++++++----- docs/en/index.rst | 35 +++++++++++++--------- docs/mcp.md | 10 +++---- docs/zh-cn/index.rst | 6 ++-- docs/zh-tw/index.rst | 6 ++-- 6 files changed, 100 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index cd293e8..f1576e9 100644 --- a/README.md +++ b/README.md @@ -155,11 +155,12 @@ the template for any multi-paper search. The zh-tw companion is at - **Eleven pluggable sources**: `arxiv`, `semantic_scholar`, `openalex`, `pubmed`, `acm` (Crossref-scoped), `dblp`, `crossref` (unscoped), - `openaire`, `springer` (needs API key), `ieee` (API key or opt-in - scrape), `scholar` (opt-in scrape). Each lives in `sources//` - behind a `Fetcher` adapter. A top-tier-venue whitelist filters results - to flagship CS conferences/journals plus Nature/Science/PNAS by - default; pass `--all-venues` to disable. + `openaire`, `springer` (needs API key), `ieee` (default-on via visible + Chrome; API key adds official Xplore API), `scholar` (default-on via + visible Chrome). Each lives in `sources//` behind a `Fetcher` + adapter. An optional top-tier-venue whitelist restricts results to + arXiv plus flagship CS conferences/journals; it is off by default — + pass `--top-tier-only` to enable it. - **Single-paper mode**: paste an arXiv ID, arXiv URL, DOI, PMID, or IEEE document URL — AutoPaperToPPT resolves it via the right source and emits the same export bundle. Useful for paper reading notes and thesis @@ -204,11 +205,27 @@ the template for any multi-paper search. The zh-tw companion is at and passes it to `export`. - **Python pipeline (`--enrich`)** — the CLI calls Anthropic's API itself; default model `claude-opus-4-7`. 
+- **Visible-Chrome publisher flows**: Scholar SERP, IEEE `/rest/search`, + and every paywalled-PDF download (ieeexplore / dl.acm / link.springer + / sciencedirect / wiley / oup / nature / science / …) run inside a + real visible Chrome session via `selenium`. The user solves captcha + / completes SSO in the live window once; `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` + persists the cookies across runs. +- **LLM-as-agent flow** (`scripts/llm_*.py`): when the LLM in your editor + wants to drive the browser itself (rather than let `asyncio.gather` do + it), `scripts/llm_driven_search.py` opens Chrome on Scholar + IEEE, + `scripts/llm_download_pdfs.py` walks an xlsx and downloads every paper + in one Chrome session (IEEE / ACM / Springer / arXiv / ACL Anthology / + NeurIPS / OpenReview), and `scripts/regen_*.py` shows the worked + pattern for hand-authoring a rich `PaperSummary` per paper. +- **OA PDF resolver**: post-dedup, every paper without `pdf_url` + goes through Unpaywall → S2 `openAccessPdf` → arXiv title search → + CORE.ac.uk (when keys are set). Typical lift on IEEE / ACM / Springer + / Elsevier-heavy queries: 40-70 percentage points. - **Safety by default**: HTTPS-only HTTP transport, per-source rate limit (token bucket), `defusedxml` for any XML payload, path-traversal-safe export paths, no `eval` / `exec` / `pickle` on - user input. Scholar and IEEE scraping are off by default (env-var - opt-in). + user input. ## Quick start @@ -268,10 +285,12 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` | `--filename-stem` | Override the generated filename stem. | | `--no-abstract` | Omit abstract content from exports. | | `--lang` / `-l` | Deck language: one of 14 — `en`, `zh-tw`, `zh-cn`, `ja`, `es`, `fr`, `de`, `ko`, `pt`, `ru`, `it`, `vi`, `hi`, `id`. Default `en`. | -| `--enrich` | Download PDF + Anthropic-summarise. Needs `ANTHROPIC_API_KEY` and `[intelligence]` extra. 
| -| `--lightweight` | Force the abstract-only deck even when `ANTHROPIC_API_KEY` is set. | +| `--enrich` | Fail-loud variant of auto-enrich. Needs `ANTHROPIC_API_KEY` and `[intelligence]` extra. (Auto-enrich is default when the key is set.) | +| `--lightweight` | Skip enrichment + force the abstract-only deck. Use only for quick / unattended runs; **when an LLM agent is driving, prefer the LLM-as-agent flow** below. | | `--llm-model` | Override default `claude-opus-4-7` for enrichment. | -| `--all-venues` | Disable the top-tier whitelist (default keeps flagship CS venues + Nature / Science / PNAS / CACM / LNCS). | +| `--no-pdf` | Skip the automatic PDF download. Also disables the per-paper PPT gate (no PDF → no full content). | +| `--no-oa-resolve` | Skip the post-dedup OA PDF resolver (Unpaywall + S2 + arXiv + CORE.ac.uk). | +| `--top-tier-only` | Restrict results to arXiv + a curated CS-flagship whitelist (S&P, CCS, NDSS, USENIX Security, NeurIPS, ICML, ICSE, …). Off by default. | | `--paywall-threshold` | Fraction of paywalled results that triggers the confirmation prompt. Default 0.30. | | `--yes` | Skip the paywall prompt and proceed. | | `--max-slides` | Per-paper slide cap (default 25; pass 0 for unlimited). | @@ -283,20 +302,42 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` |---|---|---| | `ANTHROPIC_API_KEY` | `--enrich` | LLM auth. Not needed for the LLM-as-agent path over MCP. | | `AUTOPAPERTOPPT_LLM_MODEL` | `--enrich` | Override the default `claude-opus-4-7`. | -| `AUTOPAPERTOPPT_S2_API_KEY` | Semantic Scholar | Higher rate limit. Optional. | +| `AUTOPAPERTOPPT_S2_API_KEY` | Semantic Scholar + OA resolver | Higher rate limit; also used by the OA resolver's S2 `openAccessPdf` step. Free key at . | | `AUTOPAPERTOPPT_NCBI_API_KEY` | PubMed | Raises NCBI's anonymous limit (3/s) to 10/s. Optional. | -| `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed, ACM, Crossref, OpenAlex | Puts requests into Crossref's polite pool. 
| +| `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed, ACM, Crossref, OpenAlex, **Unpaywall** | Polite-pool tag + enables the OA resolver's Unpaywall step (biggest PDF-coverage win for IEEE / ACM / Springer / Elsevier-paywalled papers; typical lift 40-70 pp). | | `AUTOPAPERTOPPT_IEEE_API_KEY` | IEEE (API path) | Official IEEE Xplore API; surfaces `pdf_url` for in-scope papers. | -| `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING` | IEEE (scrape path) | `=1` opts into scraping. Not needed when the API key is set. | +| `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING` | IEEE | **IEEE is default-ON via visible Chrome.** Set `=1` to opt out (e.g. CI without Chrome). The httpx scrape branch only runs as a fallback when WebRunner is unavailable. | | `AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN` | ACM, Crossref | Crossref Plus subscriber token (Bearer header). Optional. | -| `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | Required; free key from . Plugin is silently skipped without it. | -| `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` opts into scraping. Off by default — Scholar ToS forbids scraping. | +| `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | Required; free key from . Plugin raises `ConfigError` without it. | +| `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING` | Google Scholar | **Scholar is default-ON via visible Chrome.** Set `=1` to opt out (Google's ToS forbids automated access — default-on for coverage, opt-out to avoid captcha / IP-block risk). | +| `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` | Scholar + IEEE + paywalled-PDF downloads | Persistent Chrome `--user-data-dir`. Set this and complete VPN / SSO / Google sign-in once; subsequent runs inherit the cookies so IEEE returns paywalled metadata and Scholar serves un-throttled SERPs. | +| `AUTOPAPERTOPPT_DISABLE_WEBRUNNER` | Scholar + IEEE + paywalled-PDF downloads | `=1` forces the httpx paths instead of driving real Chrome. Useful for CI / Docker without a Chrome binary; otherwise leave unset. 
| +| `AUTOPAPERTOPPT_CORE_API_KEY` | OA resolver | Free key from . Enables the CORE.ac.uk lookup step (200M+ institutional / regional OA items). Other OA strategies (Unpaywall, S2, arXiv) still run without it. | | `AUTOPAPERTOPPT_PDF_COOKIES_FILE` | PDF downloader | Netscape `cookies.txt`. Off by default. Use only with publishers you have institutional rights to. | | `AUTOPAPERTOPPT_LOG_LEVEL` | logger | `INFO` default; `DEBUG` for verbose tracing. | Defaults: `--query` → `pptx,xlsx,bib`. `--paper` → `pptx,bib`. Always overridable with explicit `--export`. +## LLM-as-agent flow + +When an LLM in your editor (Claude Code, Cursor, Aider, Codex CLI, …) +wants to drive the publisher browser itself — pick URLs, inspect the +returned DOM, decide which papers to dig into — five scripts under +`scripts/` cover the canonical path: + +| Script | What it does | +|---|---| +| `scripts/llm_driven_search.py ""` | Boots visible Chrome, navigates Scholar SERP for the query, JS-fetches IEEE `/rest/search` from inside the IEEE origin, dumps SERP HTML + IEEE JSON to `exports/_llm_scratch/`. | +| `scripts/llm_parse_results.py` | Reads the dumped artefacts, runs the project's parsers, dedups + ranks + exports `.xlsx` + `.md` for the LLM to inspect. | +| `scripts/llm_download_pdfs.py ` | Walks the xlsx, dispatches each row to the right per-publisher downloader (IEEE / ACM / Springer / arXiv / ACL Anthology / NeurIPS / OpenReview) in ONE Chrome session. Idempotent: papers with a valid `.pdf` already on disk skip immediately. | +| `scripts/llm_download_{ieee,acm,springer}_pdf.py ` | Single-paper variants for iterating on selectors / debugging one entry. | +| `scripts/regen_*.py` | Worked example of hand-authored rich `PaperSummary` per paper → rich-tier `.pptx`. Look at `scripts/regen_speculative_decoding_zh_tw.py` for the canonical shape. 
| + +Full end-to-end runbook (search → rich deck) lives in +`.claude/agents/paper-summary-author.md` — open it before starting a +new query so the LLM can run the flow without pausing for user input. + ## MCP server Register with Claude Code: diff --git a/docs/cli.md b/docs/cli.md index 6ea7415..b358fe4 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -35,7 +35,7 @@ autopapertoppt (--query KEYWORDS | --paper IDENTIFIER) |---|---|---| | `--query` / `-q` | — | Keywords; mutually exclusive with `--paper`. | | `--paper` / `-p` | — | arXiv (`2401.08741` / `https://arxiv.org/abs/...`), DOI (`10.x/y`), PMID (`12345678` or `https://pubmed.ncbi.nlm.nih.gov/...`), or IEEE document URL (`https://ieeexplore.ieee.org/document/...`). | -| `--source` / `-s` | default mix | Comma-separated. Available: `arxiv`, `semantic_scholar`, `openalex`, `pubmed`, `acm`, `dblp`, `crossref`, `openaire`, `ieee`, `springer`, `scholar`. The default mix is every plugin that needs no API key (the first 8); `ieee` joins when `AUTOPAPERTOPPT_IEEE_API_KEY` or `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING=1` is set, `springer` joins when `AUTOPAPERTOPPT_SPRINGER_API_KEY` is set, `scholar` joins when `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING=1` is set. | +| `--source` / `-s` | default mix | Comma-separated. Available: `arxiv`, `semantic_scholar`, `openalex`, `pubmed`, `acm`, `dblp`, `crossref`, `openaire`, `ieee`, `springer`, `scholar`. **Default-on**: the first 8 + `ieee` + `scholar` (the latter two run visible Chrome via WebRunner; opt out with `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING=1` / `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING=1`). `springer` joins only when `AUTOPAPERTOPPT_SPRINGER_API_KEY` is set. `AUTOPAPERTOPPT_IEEE_API_KEY` switches IEEE to the official Xplore API (anonymous-safe, no Chrome needed). | | `--max` / `-n` | `25` | Range 1..200. | | `--year-from`, `--year-to` | — | Inclusive year filter. | | `--export` / `-e` | mode-specific | Any of `pptx`, `xlsx`, `md`, `bib`, `json`. 
**Default with `--query` is `pptx,xlsx,bib`; default with `--paper` is `pptx,bib`** (one-row Excel is busy work). Explicit `--export` always wins. | @@ -44,7 +44,7 @@ autopapertoppt (--query KEYWORDS | --paper IDENTIFIER) | `--no-abstract` | off | Drops abstracts and any LLM summary content; the deck shows only title / author / link slides. | | `--lang` / `-l` | `en` | Slide-deck template language. Supported: `en`, `zh-tw`, `zh-cn`, `ja`, `es`, `fr`, `de`, `ko`, `pt`, `ru`, `it`, `vi`, `hi`, `id` (14 in total). When combined with `--enrich`, also instructs the LLM to write its bullets in this language. | | `--enrich` | auto-on when `ANTHROPIC_API_KEY` is set | Fetch each paper's PDF and have the Anthropic API write a structured summary; the deck switches to thesis-style layout. Requires `ANTHROPIC_API_KEY` and the `[intelligence]` extra. **Not needed when running over MCP** — an LLM agent can call `fetch_pdf_text` + `export` directly with a hand-crafted summary. | -| `--lightweight` | off | Force the abstract-only deck even when `ANTHROPIC_API_KEY` is set. Useful for unattended runs where you do not want to spend tokens. | +| `--lightweight` | off | Force the abstract-only deck even when `ANTHROPIC_API_KEY` is set. Useful for unattended runs where you do not want to spend tokens. **When an LLM agent is in the editor session**, prefer the LLM-as-agent flow under `scripts/llm_*.py` (the LLM authors a rich `PaperSummary` per paper) over `--lightweight`. | | `--llm-model` | `claude-opus-4-7` | Override the default model used when `--enrich` is on. Also reads `AUTOPAPERTOPPT_LLM_MODEL`. | | `--top-tier-only` | off | Restrict results to the curated top-tier CS venue whitelist (S&P / CCS / NDSS / USENIX Security / NeurIPS / ICML / ICSE / SIGMOD / SIGCOMM / CHI / etc.) + arXiv pass-through. **Off by default** so IEEE / ACM workshop papers (which dominate "LLM × security" / "LLM × X" topics) survive. 
| | `--no-oa-resolve` | off | Skip the open-access PDF resolver step that runs after dedup. By default the pipeline looks up every paper without `pdf_url` in Unpaywall (needs `AUTOPAPERTOPPT_CONTACT_EMAIL`) and falls back to an arXiv title search — typical lift of 40-70% for IEEE / ACM / Springer / Elsevier paywalled papers. Use this flag if you want raw source output without OA enrichment, or to skip the extra HTTP round-trips on a tight latency budget. | @@ -157,21 +157,26 @@ automated traffic, or because the upstream service needs an API key that we cannot ship in the repo: ```bash -# IEEE — official API path (preferred) +# IEEE — official API path (anonymous-safe, no Chrome needed) export AUTOPAPERTOPPT_IEEE_API_KEY=... autopapertoppt --paper "https://ieeexplore.ieee.org/document/10965643" --out ./exports/ -# IEEE — fallback scrape path -export AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING=1 +# IEEE — default visible-Chrome path (no key needed; works if you have VPN/subscription) +# IEEE is default-ON; opt out only on CI / no-Chrome: +# export AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING=1 autopapertoppt --paper "https://ieeexplore.ieee.org/document/10965643" --out ./exports/ # Springer Nature — free API key from https://dev.springernature.com/ export AUTOPAPERTOPPT_SPRINGER_API_KEY=... autopapertoppt --query "diffusion models" --source springer --out ./exports/ -# Google Scholar — ToS forbids scraping; off by default -export AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING=1 +# Google Scholar — default-ON via visible Chrome +# Opt out (e.g. 
on CI) with: export AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING=1 autopapertoppt --query "attention mechanism" --source scholar --out ./exports/ + +# Persistent Chrome profile — set once, VPN/SSO + Google sign-in survive across runs +export AUTOPAPERTOPPT_CHROME_PROFILE_DIR=~/.cache/autopapertoppt-chrome +autopapertoppt --query "speculative decoding" --out ./exports/ ``` Other source-related env vars (all optional): diff --git a/docs/en/index.rst b/docs/en/index.rst index 4496a9d..15521df 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -196,8 +196,8 @@ Single-paper mode autopapertoppt --paper "https://pubmed.ncbi.nlm.nih.gov/34567890/" \ --out ./exports/ - # IEEE document URL (requires opt-in env var) - AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING=1 \ + # IEEE document URL (default-on via visible Chrome; opt out with + # AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING=1 if you have no Chrome binary) autopapertoppt --paper "https://ieeexplore.ieee.org/document/10965643" \ --out ./exports/ @@ -205,9 +205,10 @@ Available source plugins ^^^^^^^^^^^^^^^^^^^^^^^^ The default mix used when ``--source`` is omitted is every plugin that -needs no API key: ``arxiv``, ``semantic_scholar``, ``openalex``, -``pubmed``, ``acm``, ``dblp``, ``crossref``, ``openaire``. Three more -plugins join when their env var is set: +needs no paid API key plus ``ieee`` + ``scholar`` (both default-on via +visible Chrome): ``arxiv``, ``semantic_scholar``, ``openalex``, +``pubmed``, ``acm``, ``dblp``, ``crossref``, ``openaire``, ``ieee``, +``scholar``. One more plugin joins when its env var is set: .. list-table:: :header-rows: 1 @@ -217,10 +218,12 @@ plugins join when their env var is set: - Env var - Notes * - ``ieee`` - - ``AUTOPAPERTOPPT_IEEE_API_KEY`` (preferred) **or** - ``AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING=1`` - - Official Xplore API surfaces ``pdf_url`` for in-scope papers; - the scrape path is a fallback when no key is available. 
+ - default-on; ``AUTOPAPERTOPPT_IEEE_API_KEY`` switches to the + official Xplore API; ``AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING=1`` + opts out entirely + - Without the API key, the search + document fetch run through + visible Chrome (selenium). The httpx fallback is a CI / no-Chrome + safety net. * - ``springer`` - ``AUTOPAPERTOPPT_SPRINGER_API_KEY`` - Free key from https://dev.springernature.com/. Covers Nature, @@ -228,12 +231,16 @@ plugins join when their env var is set: The plugin raises ``ConfigError`` at construction without a key, which the pipeline silently skips. * - ``scholar`` - - ``AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING=1`` - - Google Scholar ToS forbids scraping — off by default. + - default-on; ``AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING=1`` opts out + - SERP fetch runs in visible Chrome. Google ToS forbids automation; + opt out to avoid captcha / IP-block risk. -The search pipeline filters results to a curated top-tier venue -whitelist (flagship CS conferences + Nature / Science / PNAS / CACM / -LNCS); pass ``--all-venues`` to disable. +Set ``AUTOPAPERTOPPT_CHROME_PROFILE_DIR`` to a persistent path so +VPN / institutional SSO / Google sign-in survive across runs. + +The search pipeline can optionally restrict results to a curated +top-tier venue whitelist (flagship CS conferences + arXiv pass-through); +pass ``--top-tier-only`` to enable it (off by default). 
Localised deck + LLM enrichment ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/mcp.md b/docs/mcp.md index 5f63530..fd67696 100644 --- a/docs/mcp.md +++ b/docs/mcp.md @@ -112,11 +112,11 @@ Returns: "enabled": true}, {"name": "springer", "in_default_mix": true, "enabled": false, "needs_env_var": ["AUTOPAPERTOPPT_SPRINGER_API_KEY"]}, - {"name": "ieee", "in_default_mix": true, "enabled": false, - "needs_env_var": ["AUTOPAPERTOPPT_IEEE_API_KEY", - "AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING"]}, - {"name": "scholar", "in_default_mix": false, "enabled": false, - "needs_env_var": ["AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING"]} + {"name": "ieee", "in_default_mix": true, "enabled": true, + "opt_out_env_var": "AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING", + "needs_env_var": ["AUTOPAPERTOPPT_IEEE_API_KEY"]}, + {"name": "scholar", "in_default_mix": true, "enabled": true, + "opt_out_env_var": "AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING"} ] } ``` diff --git a/docs/zh-cn/index.rst b/docs/zh-cn/index.rst index aac4b97..994f764 100644 --- a/docs/zh-cn/index.rst +++ b/docs/zh-cn/index.rst @@ -183,7 +183,7 @@ CLI --out ./exports/ # IEEE document URL(需 opt-in env var) - AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING=1 \ + AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING=1 \ autopapertoppt --paper "https://ieeexplore.ieee.org/document/10965643" \ --out ./exports/ @@ -204,7 +204,7 @@ plugin 也会加入: - 备注 * - ``ieee`` - ``AUTOPAPERTOPPT_IEEE_API_KEY``\ (建议)**或**\ - ``AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING=1`` + ``AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING=1`` - 官方 Xplore API 在订阅范围内会带 ``pdf_url``;没 key 时可用 fallback 爬取路径。 * - ``springer`` @@ -214,7 +214,7 @@ plugin 也会加入: 没 key 时 plugin 会在构造时抛 ``ConfigError``,被 pipeline 静默跳过。 * - ``scholar`` - - ``AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING=1`` + - ``AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING=1`` - Google Scholar ToS 禁止爬取,默认关闭。 搜索流水线默认套用「顶级期刊白名单」(旗舰级 CS 会议 + Nature / Science / diff --git a/docs/zh-tw/index.rst b/docs/zh-tw/index.rst index fedb49a..e2c6e04 100644 --- a/docs/zh-tw/index.rst +++ 
b/docs/zh-tw/index.rst @@ -183,7 +183,7 @@ CLI --out ./exports/ # IEEE document URL(需 opt-in env var) - AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING=1 \ + AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING=1 \ autopapertoppt --paper "https://ieeexplore.ieee.org/document/10965643" \ --out ./exports/ @@ -204,7 +204,7 @@ plugin 也會加進來: - 備註 * - ``ieee`` - ``AUTOPAPERTOPPT_IEEE_API_KEY``\ (建議)**或**\ - ``AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING=1`` + ``AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING=1`` - 官方 Xplore API 在訂閱範圍內會帶 ``pdf_url``;沒 key 時可用 fallback 爬取路徑。 * - ``springer`` @@ -214,7 +214,7 @@ plugin 也會加進來: 沒 key 時 plugin 會在建構時拋 ``ConfigError``,被 pipeline 靜默跳過。 * - ``scholar`` - - ``AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING=1`` + - ``AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING=1`` - Google Scholar ToS 禁止爬取,預設關閉。 搜尋管線預設套用「頂級期刊白名單」(旗艦級 CS 會議 + Nature / Science / From d78cddd1782719a72130212f84a201776d129280 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Tue, 19 May 2026 17:42:15 +0800 Subject: [PATCH 2/3] GUI Settings: add Chrome profile dir picker + CORE.ac.uk API key field Two env vars introduced over the WebRunner / OA-resolver work didn't have a UI surface yet, so users had to set them in the shell before launching the GUI. autopapertoppt/gui/pages/settings.py: - Add AUTOPAPERTOPPT_CORE_API_KEY as a new password-masked field in _SETTINGS_FIELDS (alongside the other API keys). - Add a dedicated CHROME_PROFILE_DIR row with a "Browse..." button that opens QFileDialog.getExistingDirectory (Path picker, not file picker -- profile is a directory). The dir survives across CLI invocations so VPN / SSO / Google sign-in cookies persist. - apply_saved_env() + _on_save() + _load_from_store() mirror the value into os.environ for the current process, matching the existing cookies-file pattern. autopapertoppt/gui/i18n.py: - Add settings.core_key, settings.chrome_profile_dir, and settings.chrome_profile_dialog_title translations for all 14 supported languages. Existing 59 GUI tests still pass. 
--- autopapertoppt/gui/i18n.py | 48 ++++++++++++++++++++++++++ autopapertoppt/gui/pages/settings.py | 50 ++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/autopapertoppt/gui/i18n.py b/autopapertoppt/gui/i18n.py index 4e64b23..48949b0 100644 --- a/autopapertoppt/gui/i18n.py +++ b/autopapertoppt/gui/i18n.py @@ -960,6 +960,54 @@ "hi": "कुकीज़ फ़ाइल चुनें", "id": "Pilih berkas cookies", }, + "settings.core_key": { + "en": "CORE.ac.uk API key (OA resolver)", + "zh-tw": "CORE.ac.uk API 金鑰(OA 解析器)", + "zh-cn": "CORE.ac.uk API 密钥(OA 解析器)", + "ja": "CORE.ac.uk API キー(OA リゾルバ)", + "es": "Clave API de CORE.ac.uk (resolver OA)", + "fr": "Clé API CORE.ac.uk (résolveur OA)", + "de": "CORE.ac.uk API-Schlüssel (OA-Resolver)", + "ko": "CORE.ac.uk API 키 (OA 리졸버)", + "pt": "Chave de API CORE.ac.uk (resolvedor OA)", + "ru": "Ключ API CORE.ac.uk (резолвер OA)", + "it": "Chiave API CORE.ac.uk (risolutore OA)", + "vi": "Khóa API CORE.ac.uk (bộ phân giải OA)", + "hi": "CORE.ac.uk API कुंजी (OA रिज़ॉल्वर)", + "id": "Kunci API CORE.ac.uk (resolver OA)", + }, + "settings.chrome_profile_dir": { + "en": "Chrome profile directory (Scholar / IEEE / paywalled PDFs)", + "zh-tw": "Chrome 設定檔資料夾 (Scholar / IEEE / 付費 PDF)", + "zh-cn": "Chrome 配置文件目录 (Scholar / IEEE / 付费 PDF)", + "ja": "Chrome プロファイルディレクトリ (Scholar / IEEE / 有料 PDF)", + "es": "Directorio de perfil de Chrome (Scholar / IEEE / PDFs de pago)", + "fr": "Répertoire de profil Chrome (Scholar / IEEE / PDFs payants)", + "de": "Chrome-Profilverzeichnis (Scholar / IEEE / kostenpflichtige PDFs)", + "ko": "Chrome 프로필 디렉터리 (Scholar / IEEE / 유료 PDF)", + "pt": "Diretório do perfil do Chrome (Scholar / IEEE / PDFs pagos)", + "ru": "Каталог профиля Chrome (Scholar / IEEE / платные PDF)", + "it": "Directory del profilo Chrome (Scholar / IEEE / PDF a pagamento)", + "vi": "Thư mục hồ sơ Chrome (Scholar / IEEE / PDF trả phí)", + "hi": "Chrome प्रोफ़ाइल निर्देशिका (Scholar / IEEE / पेड PDF)", + "id": "Direktori profil Chrome 
(Scholar / IEEE / PDF berbayar)", + }, + "settings.chrome_profile_dialog_title": { + "en": "Choose Chrome profile directory", + "zh-tw": "選擇 Chrome 設定檔資料夾", + "zh-cn": "选择 Chrome 配置文件目录", + "ja": "Chrome プロファイルディレクトリを選択", + "es": "Elija el directorio de perfil de Chrome", + "fr": "Choisir le répertoire de profil Chrome", + "de": "Chrome-Profilverzeichnis auswählen", + "ko": "Chrome 프로필 디렉터리 선택", + "pt": "Escolha o diretório do perfil do Chrome", + "ru": "Выберите каталог профиля Chrome", + "it": "Scegli la directory del profilo Chrome", + "vi": "Chọn thư mục hồ sơ Chrome", + "hi": "Chrome प्रोफ़ाइल निर्देशिका चुनें", + "id": "Pilih direktori profil Chrome", + }, } diff --git a/autopapertoppt/gui/pages/settings.py b/autopapertoppt/gui/pages/settings.py index 779189f..5337fc4 100644 --- a/autopapertoppt/gui/pages/settings.py +++ b/autopapertoppt/gui/pages/settings.py @@ -61,6 +61,11 @@ "AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN", "settings.crossref_token", ), + ( + "api/core", + "AUTOPAPERTOPPT_CORE_API_KEY", + "settings.core_key", + ), ( "contact/email", "AUTOPAPERTOPPT_CONTACT_EMAIL", @@ -70,6 +75,8 @@ _COOKIES_KEY: Final[str] = "pdf/cookies_file" _COOKIES_ENV: Final[str] = "AUTOPAPERTOPPT_PDF_COOKIES_FILE" +_CHROME_PROFILE_KEY: Final[str] = "browser/chrome_profile_dir" +_CHROME_PROFILE_ENV: Final[str] = "AUTOPAPERTOPPT_CHROME_PROFILE_DIR" _UI_LANG_KEY: Final[str] = "ui/language" _ORG: Final[str] = "AutoPaperToPPT" @@ -103,6 +110,11 @@ def apply_saved_env() -> None: os.environ[_COOKIES_ENV] = cookies else: os.environ.pop(_COOKIES_ENV, None) + chrome_profile = store.value(_CHROME_PROFILE_KEY, "", type=str) + if chrome_profile: + os.environ[_CHROME_PROFILE_ENV] = chrome_profile + else: + os.environ.pop(_CHROME_PROFILE_ENV, None) def saved_ui_language(default: str = "en") -> str: @@ -164,6 +176,22 @@ def _build_ui(self) -> None: cookies_layout.addWidget(browse) outer.addWidget(cookies_row) + chrome_row = QWidget(self) + chrome_layout = QHBoxLayout(chrome_row) + 
chrome_layout.setContentsMargins(0, 0, 0, 0) + self._chrome_profile_input = QLineEdit(self) + self._chrome_profile_input.setReadOnly(True) + chrome_browse = QPushButton( + t("settings.browse_button", self._ui_language), self, + ) + chrome_browse.clicked.connect(self._on_browse_chrome_profile) + chrome_layout.addWidget( + QLabel(t("settings.chrome_profile_dir", self._ui_language)), + ) + chrome_layout.addWidget(self._chrome_profile_input, stretch=1) + chrome_layout.addWidget(chrome_browse) + outer.addWidget(chrome_row) + save_row = QHBoxLayout() save_row.addStretch(1) save_button = QPushButton(t("settings.save_button", self._ui_language), self) @@ -186,6 +214,16 @@ def _on_browse_cookies(self) -> None: if path: self._cookies_input.setText(path) + def _on_browse_chrome_profile(self) -> None: + start = self._chrome_profile_input.text() or str(Path.home()) + path = QFileDialog.getExistingDirectory( + self, + t("settings.chrome_profile_dialog_title", self._ui_language), + start, + ) + if path: + self._chrome_profile_input.setText(path) + def _on_save(self) -> None: store = settings_store() for key, env_var, _label in _SETTINGS_FIELDS: @@ -201,6 +239,12 @@ def _on_save(self) -> None: os.environ[_COOKIES_ENV] = cookies else: os.environ.pop(_COOKIES_ENV, None) + chrome_profile = self._chrome_profile_input.text().strip() + store.setValue(_CHROME_PROFILE_KEY, chrome_profile) + if chrome_profile: + os.environ[_CHROME_PROFILE_ENV] = chrome_profile + else: + os.environ.pop(_CHROME_PROFILE_ENV, None) lang = self._ui_lang_combo.currentData() or "en" store.setValue(_UI_LANG_KEY, lang) store.sync() @@ -213,6 +257,9 @@ def _load_from_store(self) -> None: for key, _env_var, _label in _SETTINGS_FIELDS: self._field_inputs[key].setText(store.value(key, "", type=str)) self._cookies_input.setText(store.value(_COOKIES_KEY, "", type=str)) + self._chrome_profile_input.setText( + store.value(_CHROME_PROFILE_KEY, "", type=str), + ) saved_lang = store.value(_UI_LANG_KEY, self._ui_language, 
type=str) for idx in range(self._ui_lang_combo.count()): if self._ui_lang_combo.itemData(idx) == saved_lang: @@ -227,6 +274,9 @@ def field_input(self, qsettings_key: str) -> QLineEdit: def cookies_input(self) -> QLineEdit: return self._cookies_input + def chrome_profile_input(self) -> QLineEdit: + return self._chrome_profile_input + def status_text(self) -> str: return self._status_label.text() From d87869be37caaa2f65c0acf4bb946bbd6323b70b Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Tue, 19 May 2026 17:42:25 +0800 Subject: [PATCH 3/3] Patch 13 non-English READMEs to current env-var names Each translated README mirrored the English env-var table, so the same ENABLE_* -> DISABLE_* rename + three new rows (CHROME_PROFILE_DIR, DISABLE_WEBRUNNER, CORE_API_KEY) applied surgically to all 13 files. The variable names + the new rows' purpose text are English; full translation of every nuance into 13 languages is out of scope, and the existing translations already left the variable names in English. The localized prose around each row is preserved. scripts/_update_readme_envvars.py drives the substitution -- kept under underscore prefix because it's a one-shot patch helper, not a recurring CLI. Re-running it is a no-op (the new-row insert checks for an existing row with the variable name before adding). 
--- readmes/README.de.md | 7 ++- readmes/README.es.md | 7 ++- readmes/README.fr.md | 7 ++- readmes/README.hi.md | 7 ++- readmes/README.id.md | 7 ++- readmes/README.it.md | 7 ++- readmes/README.ja.md | 7 ++- readmes/README.ko.md | 7 ++- readmes/README.pt.md | 7 ++- readmes/README.ru.md | 7 ++- readmes/README.vi.md | 7 ++- readmes/README.zh-CN.md | 7 ++- readmes/README.zh-TW.md | 7 ++- scripts/_update_readme_envvars.py | 100 ++++++++++++++++++++++++++++++ 14 files changed, 165 insertions(+), 26 deletions(-) create mode 100644 scripts/_update_readme_envvars.py diff --git a/readmes/README.de.md b/readmes/README.de.md index 2cbbcca..655ec50 100644 --- a/readmes/README.de.md +++ b/readmes/README.de.md @@ -168,10 +168,13 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` | `AUTOPAPERTOPPT_NCBI_API_KEY` | PubMed | Erhöht NCBIs anonymes Limit (3/s) auf 10/s. Optional. | | `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed, ACM, Crossref, OpenAlex | Setzt Anfragen in Crossrefs „Polite Pool". | | `AUTOPAPERTOPPT_IEEE_API_KEY` | IEEE (API-Pfad) | Offizielle IEEE-Xplore-API; legt `pdf_url` für abgedeckte Paper offen. | -| `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING` | IEEE (Scraping-Pfad) | `=1` aktiviert Scraping. Nicht nötig, wenn der API-Schlüssel gesetzt ist. | +| `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING` | IEEE (Scraping-Pfad) | `=1` deaktiviert IEEE (Opt-out). Standardmäßig über sichtbares Chrome aktiv. | | `AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN` | ACM, Crossref | Crossref-Plus-Abonnement-Token (Bearer-Header). Optional. | | `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | Pflicht; kostenloser Schlüssel über . Ohne Schlüssel wird das Plugin still übersprungen. | -| `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` aktiviert Scraping. Default aus — Scholars ToS verbietet Scraping. | +| `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` | Scholar + IEEE + paywalled-PDF downloads | Persistent Chrome `--user-data-dir`. 
Set this and complete VPN / SSO once; subsequent runs inherit the cookies. | +| `AUTOPAPERTOPPT_DISABLE_WEBRUNNER` | Scholar + IEEE + paywalled-PDF downloads | `=1` forces the httpx paths instead of driving real Chrome. For CI / Docker without a Chrome binary. | +| `AUTOPAPERTOPPT_CORE_API_KEY` | OA resolver | Free key from . Enables the CORE.ac.uk lookup step in the OA PDF resolver. | +| `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` aktiviert Scraping. Default aus — Scholars ToS verbietet Scraping. | | `AUTOPAPERTOPPT_PDF_COOKIES_FILE` | PDF-Downloader | `cookies.txt` im Netscape-Format. Default aus. Nur bei Verlagen verwenden, für die Sie institutionelle Zugriffsrechte haben. | | `AUTOPAPERTOPPT_LOG_LEVEL` | Logger | `INFO` als Default; `DEBUG` für verbose Traces. | diff --git a/readmes/README.es.md b/readmes/README.es.md index ca6f4dd..9a6d58f 100644 --- a/readmes/README.es.md +++ b/readmes/README.es.md @@ -168,10 +168,13 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` | `AUTOPAPERTOPPT_NCBI_API_KEY` | PubMed | Eleva el límite anónimo de NCBI (3/s) a 10/s. Opcional. | | `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed, ACM, Crossref, OpenAlex | Pone las peticiones en el pool cortés de Crossref. | | `AUTOPAPERTOPPT_IEEE_API_KEY` | IEEE (ruta API) | API oficial IEEE Xplore; expone `pdf_url` para artículos en el alcance de la subscripción. | -| `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING` | IEEE (ruta scraping) | `=1` activa scraping. No necesaria cuando la API key está configurada. | +| `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING` | IEEE (ruta scraping) | `=1` activa scraping. No necesaria cuando la API key está configurada. | | `AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN` | ACM, Crossref | Token de subscriptor Crossref Plus (cabecera Bearer). Opcional. | | `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | Obligatoria; clave gratuita en . El plugin se omite silenciosamente sin ella. 
| -| `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` activa scraping. Por defecto deshabilitado — los ToS de Scholar prohíben scraping. | +| `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` | Scholar + IEEE + paywalled-PDF downloads | Persistent Chrome `--user-data-dir`. Set this and complete VPN / SSO once; subsequent runs inherit the cookies. | +| `AUTOPAPERTOPPT_DISABLE_WEBRUNNER` | Scholar + IEEE + paywalled-PDF downloads | `=1` forces the httpx paths instead of driving real Chrome. For CI / Docker without a Chrome binary. | +| `AUTOPAPERTOPPT_CORE_API_KEY` | OA resolver | Free key from . Enables the CORE.ac.uk lookup step in the OA PDF resolver. | +| `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` activa scraping. Por defecto deshabilitado — los ToS de Scholar prohíben scraping. | | `AUTOPAPERTOPPT_PDF_COOKIES_FILE` | Descargador PDF | `cookies.txt` formato Netscape. Por defecto deshabilitado. Use solo con editoriales para las que tenga derechos institucionales. | | `AUTOPAPERTOPPT_LOG_LEVEL` | logger | `INFO` por defecto; `DEBUG` para trazas verbosas. | diff --git a/readmes/README.fr.md b/readmes/README.fr.md index 2575be5..cef73f1 100644 --- a/readmes/README.fr.md +++ b/readmes/README.fr.md @@ -168,10 +168,13 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` | `AUTOPAPERTOPPT_NCBI_API_KEY` | PubMed | Élève la limite anonyme de NCBI (3/s) à 10/s. Optionnel. | | `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed, ACM, Crossref, OpenAlex | Place les requêtes dans le pool poli de Crossref. | | `AUTOPAPERTOPPT_IEEE_API_KEY` | IEEE (voie API) | API officielle IEEE Xplore ; expose `pdf_url` pour les articles couverts. | -| `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING` | IEEE (voie scraping) | `=1` active le scraping. Inutile quand la clé API est définie. | +| `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING` | IEEE (voie scraping) | `=1` active le scraping. Inutile quand la clé API est définie. 
| | `AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN` | ACM, Crossref | Jeton abonné Crossref Plus (en-tête Bearer). Optionnel. | | `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | Obligatoire ; clé gratuite sur . Le plugin est silencieusement ignoré sinon. | -| `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` active le scraping. Désactivé par défaut — les CGU de Scholar interdisent le scraping. | +| `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` | Scholar + IEEE + paywalled-PDF downloads | Persistent Chrome `--user-data-dir`. Set this and complete VPN / SSO once; subsequent runs inherit the cookies. | +| `AUTOPAPERTOPPT_DISABLE_WEBRUNNER` | Scholar + IEEE + paywalled-PDF downloads | `=1` forces the httpx paths instead of driving real Chrome. For CI / Docker without a Chrome binary. | +| `AUTOPAPERTOPPT_CORE_API_KEY` | OA resolver | Free key from . Enables the CORE.ac.uk lookup step in the OA PDF resolver. | +| `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` active le scraping. Désactivé par défaut — les CGU de Scholar interdisent le scraping. | | `AUTOPAPERTOPPT_PDF_COOKIES_FILE` | Téléchargeur PDF | `cookies.txt` au format Netscape. Désactivé par défaut. Utilisez uniquement avec des éditeurs pour lesquels vous avez des droits institutionnels. | | `AUTOPAPERTOPPT_LOG_LEVEL` | logger | `INFO` par défaut ; `DEBUG` pour traces verbeuses. 
| diff --git a/readmes/README.hi.md b/readmes/README.hi.md index 0ab6e5a..3f8161f 100644 --- a/readmes/README.hi.md +++ b/readmes/README.hi.md @@ -168,10 +168,13 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` | `AUTOPAPERTOPPT_NCBI_API_KEY` | PubMed | NCBI की अनाम सीमा (3/s) को 10/s तक बढ़ाता है। वैकल्पिक। | | `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed, ACM, Crossref, OpenAlex | अनुरोधों को Crossref के polite pool में रखता है। | | `AUTOPAPERTOPPT_IEEE_API_KEY` | IEEE (API पथ) | आधिकारिक IEEE Xplore API; दायरे में आने वाले शोध-पत्रों के लिए `pdf_url` उजागर करता है। | -| `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING` | IEEE (स्क्रैपिंग पथ) | `=1` स्क्रैपिंग सक्षम करता है। API key सेट होने पर आवश्यक नहीं। | +| `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING` | IEEE (स्क्रैपिंग पथ) | `=1` स्क्रैपिंग सक्षम करता है। API key सेट होने पर आवश्यक नहीं। | | `AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN` | ACM, Crossref | Crossref Plus ग्राहक टोकन (Bearer हेडर)। वैकल्पिक। | | `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | अनिवार्य; मुफ्त कुंजी से। इसके बिना plugin चुपचाप छोड़ दिया जाता है। | -| `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` स्क्रैपिंग सक्षम करता है। डिफ़ॉल्ट बंद — Scholar ToS स्क्रैपिंग निषेध। | +| `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` | Scholar + IEEE + paywalled-PDF downloads | Persistent Chrome `--user-data-dir`. Set this and complete VPN / SSO once; subsequent runs inherit the cookies. | +| `AUTOPAPERTOPPT_DISABLE_WEBRUNNER` | Scholar + IEEE + paywalled-PDF downloads | `=1` forces the httpx paths instead of driving real Chrome. For CI / Docker without a Chrome binary. | +| `AUTOPAPERTOPPT_CORE_API_KEY` | OA resolver | Free key from . Enables the CORE.ac.uk lookup step in the OA PDF resolver. 
| +| `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` स्क्रैपिंग सक्षम करता है। डिफ़ॉल्ट बंद — Scholar ToS स्क्रैपिंग निषेध। | | `AUTOPAPERTOPPT_PDF_COOKIES_FILE` | PDF डाउनलोडर | Netscape-स्वरूप `cookies.txt`। डिफ़ॉल्ट बंद। केवल उन प्रकाशकों के साथ उपयोग करें जिनके लिए आपके पास संस्थागत अधिकार हैं। | | `AUTOPAPERTOPPT_LOG_LEVEL` | logger | डिफ़ॉल्ट `INFO`; विस्तृत ट्रेस के लिए `DEBUG`। | diff --git a/readmes/README.id.md b/readmes/README.id.md index 2e4b0b9..3fa06ce 100644 --- a/readmes/README.id.md +++ b/readmes/README.id.md @@ -168,10 +168,13 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` | `AUTOPAPERTOPPT_NCBI_API_KEY` | PubMed | Menaikkan limit anonim NCBI (3/s) ke 10/s. Opsional. | | `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed, ACM, Crossref, OpenAlex | Menempatkan permintaan ke polite pool Crossref. | | `AUTOPAPERTOPPT_IEEE_API_KEY` | IEEE (jalur API) | API resmi IEEE Xplore; mengekspos `pdf_url` untuk makalah dalam cakupan. | -| `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING` | IEEE (jalur scraping) | `=1` mengaktifkan scraping. Tidak perlu saat API key sudah diset. | +| `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING` | IEEE (jalur scraping) | `=1` mengaktifkan scraping. Tidak perlu saat API key sudah diset. | | `AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN` | ACM, Crossref | Token pelanggan Crossref Plus (header Bearer). Opsional. | | `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | Wajib; kunci gratis di . Tanpa kunci, plugin dilewati diam-diam. | -| `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` mengaktifkan scraping. Default mati — ToS Scholar melarang scraping. | +| `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` | Scholar + IEEE + paywalled-PDF downloads | Persistent Chrome `--user-data-dir`. Set this and complete VPN / SSO once; subsequent runs inherit the cookies. | +| `AUTOPAPERTOPPT_DISABLE_WEBRUNNER` | Scholar + IEEE + paywalled-PDF downloads | `=1` forces the httpx paths instead of driving real Chrome. 
For CI / Docker without a Chrome binary. | +| `AUTOPAPERTOPPT_CORE_API_KEY` | OA resolver | Free key from . Enables the CORE.ac.uk lookup step in the OA PDF resolver. | +| `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` mengaktifkan scraping. Default mati — ToS Scholar melarang scraping. | | `AUTOPAPERTOPPT_PDF_COOKIES_FILE` | Pengunduh PDF | `cookies.txt` format Netscape. Default mati. Gunakan hanya dengan penerbit yang Anda miliki hak institusi. | | `AUTOPAPERTOPPT_LOG_LEVEL` | logger | Default `INFO`; `DEBUG` untuk jejak verbose. | diff --git a/readmes/README.it.md b/readmes/README.it.md index fa1aa27..05e341c 100644 --- a/readmes/README.it.md +++ b/readmes/README.it.md @@ -168,10 +168,13 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` | `AUTOPAPERTOPPT_NCBI_API_KEY` | PubMed | Alza il limite anonimo NCBI (3/s) a 10/s. Opzionale. | | `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed, ACM, Crossref, OpenAlex | Mette le richieste nel polite pool di Crossref. | | `AUTOPAPERTOPPT_IEEE_API_KEY` | IEEE (via API) | API ufficiale IEEE Xplore; espone `pdf_url` per articoli coperti. | -| `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING` | IEEE (via scraping) | `=1` abilita scraping. Non serve se la API key è impostata. | +| `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING` | IEEE (via scraping) | `=1` abilita scraping. Non serve se la API key è impostata. | | `AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN` | ACM, Crossref | Token abbonato Crossref Plus (header Bearer). Opzionale. | | `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | Obbligatoria; chiave gratuita su . Senza chiave il plugin viene saltato silenziosamente. | -| `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` abilita scraping. Default disabilitato — i ToS di Scholar lo vietano. | +| `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` | Scholar + IEEE + paywalled-PDF downloads | Persistent Chrome `--user-data-dir`. Set this and complete VPN / SSO once; subsequent runs inherit the cookies. 
| +| `AUTOPAPERTOPPT_DISABLE_WEBRUNNER` | Scholar + IEEE + paywalled-PDF downloads | `=1` forces the httpx paths instead of driving real Chrome. For CI / Docker without a Chrome binary. | +| `AUTOPAPERTOPPT_CORE_API_KEY` | OA resolver | Free key from . Enables the CORE.ac.uk lookup step in the OA PDF resolver. | +| `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` abilita scraping. Default disabilitato — i ToS di Scholar lo vietano. | | `AUTOPAPERTOPPT_PDF_COOKIES_FILE` | Downloader PDF | `cookies.txt` formato Netscape. Default disabilitato. Usa solo con editori per cui hai diritti istituzionali. | | `AUTOPAPERTOPPT_LOG_LEVEL` | logger | `INFO` di default; `DEBUG` per trace verbose. | diff --git a/readmes/README.ja.md b/readmes/README.ja.md index b29f985..2594a74 100644 --- a/readmes/README.ja.md +++ b/readmes/README.ja.md @@ -168,10 +168,13 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` | `AUTOPAPERTOPPT_NCBI_API_KEY` | PubMed | NCBI の匿名上限(3/s)を 10/s に引き上げ。任意。 | | `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed、ACM、Crossref、OpenAlex | リクエストを Crossref の polite プールに入れる。 | | `AUTOPAPERTOPPT_IEEE_API_KEY` | IEEE(API パス) | 公式 IEEE Xplore API、対象論文の `pdf_url` を公開。 | -| `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING` | IEEE(スクレイピングパス) | `=1` でスクレイピング有効。API キー設定時は不要。 | +| `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING` | IEEE(スクレイピングパス) | `=1` でスクレイピング有効。API キー設定時は不要。 | | `AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN` | ACM、Crossref | Crossref Plus 加入者トークン(Bearer ヘッダ)。任意。 | | `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | 必須。 から無料キー。未設定だとプラグインは沈黙してスキップされる。 | -| `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` でスクレイピング有効。デフォルト無効 — Scholar ToS がスクレイピングを禁止。 | +| `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` | Scholar + IEEE + paywalled-PDF downloads | Persistent Chrome `--user-data-dir`. Set this and complete VPN / SSO once; subsequent runs inherit the cookies. 
| +| `AUTOPAPERTOPPT_DISABLE_WEBRUNNER` | Scholar + IEEE + paywalled-PDF downloads | `=1` forces the httpx paths instead of driving real Chrome. For CI / Docker without a Chrome binary. | +| `AUTOPAPERTOPPT_CORE_API_KEY` | OA resolver | Free key from . Enables the CORE.ac.uk lookup step in the OA PDF resolver. | +| `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` でスクレイピング有効。デフォルト無効 — Scholar ToS がスクレイピングを禁止。 | | `AUTOPAPERTOPPT_PDF_COOKIES_FILE` | PDF ダウンローダ | Netscape 形式 `cookies.txt`。デフォルト無効。所属機関アクセス権を持つ出版社にのみ使用してください。 | | `AUTOPAPERTOPPT_LOG_LEVEL` | logger | デフォルト `INFO`、詳細ログは `DEBUG`。 | diff --git a/readmes/README.ko.md b/readmes/README.ko.md index 0e7e513..333de2f 100644 --- a/readmes/README.ko.md +++ b/readmes/README.ko.md @@ -168,10 +168,13 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` | `AUTOPAPERTOPPT_NCBI_API_KEY` | PubMed | NCBI 익명 한도 (3/s) 를 10/s 로 상향. 선택. | | `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed, ACM, Crossref, OpenAlex | 요청을 Crossref polite pool 에 넣음. | | `AUTOPAPERTOPPT_IEEE_API_KEY` | IEEE (API 경로) | 공식 IEEE Xplore API; 구독 범위 논문에 `pdf_url` 노출. | -| `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING` | IEEE (스크래핑 경로) | `=1` 로 스크래핑 활성. API 키가 설정된 경우 불필요. | +| `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING` | IEEE (스크래핑 경로) | `=1` 로 스크래핑 활성. API 키가 설정된 경우 불필요. | | `AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN` | ACM, Crossref | Crossref Plus 구독자 토큰 (Bearer 헤더). 선택. | | `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | 필수; 에서 무료 키. 없으면 플러그인이 조용히 건너뜀. | -| `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` 로 스크래핑 활성. 기본 비활성 — Scholar ToS 가 스크래핑 금지. | +| `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` | Scholar + IEEE + paywalled-PDF downloads | Persistent Chrome `--user-data-dir`. Set this and complete VPN / SSO once; subsequent runs inherit the cookies. | +| `AUTOPAPERTOPPT_DISABLE_WEBRUNNER` | Scholar + IEEE + paywalled-PDF downloads | `=1` forces the httpx paths instead of driving real Chrome. For CI / Docker without a Chrome binary. 
| +| `AUTOPAPERTOPPT_CORE_API_KEY` | OA resolver | Free key from . Enables the CORE.ac.uk lookup step in the OA PDF resolver. | +| `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` 로 스크래핑 활성. 기본 비활성 — Scholar ToS 가 스크래핑 금지. | | `AUTOPAPERTOPPT_PDF_COOKIES_FILE` | PDF 다운로더 | Netscape 형식 `cookies.txt`. 기본 비활성. 기관 접근 권한이 있는 출판사에만 사용하세요. | | `AUTOPAPERTOPPT_LOG_LEVEL` | logger | 기본 `INFO`; verbose 추적은 `DEBUG`. | diff --git a/readmes/README.pt.md b/readmes/README.pt.md index a7e2d97..5338877 100644 --- a/readmes/README.pt.md +++ b/readmes/README.pt.md @@ -168,10 +168,13 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` | `AUTOPAPERTOPPT_NCBI_API_KEY` | PubMed | Eleva o limite anônimo do NCBI (3/s) para 10/s. Opcional. | | `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed, ACM, Crossref, OpenAlex | Coloca requisições no polite pool do Crossref. | | `AUTOPAPERTOPPT_IEEE_API_KEY` | IEEE (rota API) | API oficial IEEE Xplore; expõe `pdf_url` para artigos no escopo. | -| `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING` | IEEE (rota scraping) | `=1` ativa scraping. Desnecessário quando a chave API está configurada. | +| `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING` | IEEE (rota scraping) | `=1` ativa scraping. Desnecessário quando a chave API está configurada. | | `AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN` | ACM, Crossref | Token de assinante Crossref Plus (header Bearer). Opcional. | | `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | Obrigatório; chave gratuita em . Plugin é silenciosamente pulado sem ele. | -| `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` ativa scraping. Padrão desligado — ToS do Scholar proíbe scraping. | +| `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` | Scholar + IEEE + paywalled-PDF downloads | Persistent Chrome `--user-data-dir`. Set this and complete VPN / SSO once; subsequent runs inherit the cookies. | +| `AUTOPAPERTOPPT_DISABLE_WEBRUNNER` | Scholar + IEEE + paywalled-PDF downloads | `=1` forces the httpx paths instead of driving real Chrome. 
For CI / Docker without a Chrome binary. | +| `AUTOPAPERTOPPT_CORE_API_KEY` | OA resolver | Free key from . Enables the CORE.ac.uk lookup step in the OA PDF resolver. | +| `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` ativa scraping. Padrão desligado — ToS do Scholar proíbe scraping. | | `AUTOPAPERTOPPT_PDF_COOKIES_FILE` | Downloader PDF | `cookies.txt` formato Netscape. Padrão desligado. Use apenas com editoras nas quais você tem direitos institucionais. | | `AUTOPAPERTOPPT_LOG_LEVEL` | logger | `INFO` padrão; `DEBUG` para rastreamento verboso. | diff --git a/readmes/README.ru.md b/readmes/README.ru.md index a482a9f..11e9a47 100644 --- a/readmes/README.ru.md +++ b/readmes/README.ru.md @@ -168,10 +168,13 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` | `AUTOPAPERTOPPT_NCBI_API_KEY` | PubMed | Поднимает анонимный лимит NCBI (3/с) до 10/с. Опционально. | | `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed, ACM, Crossref, OpenAlex | Помещает запросы в polite pool Crossref. | | `AUTOPAPERTOPPT_IEEE_API_KEY` | IEEE (API-путь) | Официальное API IEEE Xplore; выдаёт `pdf_url` для статей в области подписки. | -| `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING` | IEEE (scraping-путь) | `=1` включает scraping. Не нужна, когда задан API-ключ. | +| `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING` | IEEE (scraping-путь) | `=1` включает scraping. Не нужна, когда задан API-ключ. | | `AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN` | ACM, Crossref | Токен подписчика Crossref Plus (Bearer-заголовок). Опционально. | | `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | Обязательна; бесплатный ключ на . Без неё плагин тихо пропускается. | -| `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` включает scraping. По умолчанию выключен — ToS Scholar запрещают scraping. | +| `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` | Scholar + IEEE + paywalled-PDF downloads | Persistent Chrome `--user-data-dir`. Set this and complete VPN / SSO once; subsequent runs inherit the cookies. 
| +| `AUTOPAPERTOPPT_DISABLE_WEBRUNNER` | Scholar + IEEE + paywalled-PDF downloads | `=1` forces the httpx paths instead of driving real Chrome. For CI / Docker without a Chrome binary. | +| `AUTOPAPERTOPPT_CORE_API_KEY` | OA resolver | Free key from . Enables the CORE.ac.uk lookup step in the OA PDF resolver. | +| `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` включает scraping. По умолчанию выключен — ToS Scholar запрещают scraping. | | `AUTOPAPERTOPPT_PDF_COOKIES_FILE` | PDF-загрузчик | `cookies.txt` в формате Netscape. По умолчанию выключен. Используйте только с теми издателями, к которым имеете институциональные права. | | `AUTOPAPERTOPPT_LOG_LEVEL` | logger | `INFO` по умолчанию; `DEBUG` для подробных трасс. | diff --git a/readmes/README.vi.md b/readmes/README.vi.md index 2f367eb..f994a48 100644 --- a/readmes/README.vi.md +++ b/readmes/README.vi.md @@ -168,10 +168,13 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` | `AUTOPAPERTOPPT_NCBI_API_KEY` | PubMed | Nâng giới hạn ẩn danh NCBI (3/s) lên 10/s. Tùy chọn. | | `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed, ACM, Crossref, OpenAlex | Đưa request vào polite pool của Crossref. | | `AUTOPAPERTOPPT_IEEE_API_KEY` | IEEE (đường API) | API chính thức IEEE Xplore; phơi `pdf_url` cho bài thuộc phạm vi đăng ký. | -| `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING` | IEEE (đường scraping) | `=1` bật scraping. Không cần khi đã có API key. | +| `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING` | IEEE (đường scraping) | `=1` bật scraping. Không cần khi đã có API key. | | `AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN` | ACM, Crossref | Token thuê bao Crossref Plus (header Bearer). Tùy chọn. | | `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | Bắt buộc; key miễn phí tại . Plugin bị bỏ qua âm thầm nếu thiếu. | -| `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` bật scraping. Mặc định tắt — ToS Scholar cấm scraping. 
| +| `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` | Scholar + IEEE + paywalled-PDF downloads | Persistent Chrome `--user-data-dir`. Set this and complete VPN / SSO once; subsequent runs inherit the cookies. | +| `AUTOPAPERTOPPT_DISABLE_WEBRUNNER` | Scholar + IEEE + paywalled-PDF downloads | `=1` forces the httpx paths instead of driving real Chrome. For CI / Docker without a Chrome binary. | +| `AUTOPAPERTOPPT_CORE_API_KEY` | OA resolver | Free key from . Enables the CORE.ac.uk lookup step in the OA PDF resolver. | +| `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING` | Google Scholar | `=1` bật scraping. Mặc định tắt — ToS Scholar cấm scraping. | | `AUTOPAPERTOPPT_PDF_COOKIES_FILE` | Trình tải PDF | `cookies.txt` định dạng Netscape. Mặc định tắt. Chỉ dùng với nhà xuất bản bạn có quyền truy cập tổ chức. | | `AUTOPAPERTOPPT_LOG_LEVEL` | logger | Mặc định `INFO`; `DEBUG` cho trace chi tiết. | diff --git a/readmes/README.zh-CN.md b/readmes/README.zh-CN.md index 4f172c2..3c53835 100644 --- a/readmes/README.zh-CN.md +++ b/readmes/README.zh-CN.md @@ -186,10 +186,13 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` | `AUTOPAPERTOPPT_NCBI_API_KEY` | PubMed | 把 NCBI 匿名限额(3/s)提到 10/s,选用。 | | `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed、ACM、Crossref、OpenAlex | 让 Crossref 等把请求放进「礼貌池」。 | | `AUTOPAPERTOPPT_IEEE_API_KEY` | IEEE(API 路径) | 切换到官方 Xplore API,订阅范围内会带 `pdf_url`。 | -| `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING` | IEEE(爬取路径) | 设 `=1` 才启用爬取。若已设 API key,此变量不需要。 | +| `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING` | IEEE(爬取路径) | 设 `=1` 才启用爬取。若已设 API key,此变量不需要。 | | `AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN` | ACM、Crossref | Crossref Plus 订阅 token(Bearer header),选用。 | | `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | 必填;免费 key 申请 。没设则该 plugin 会被静默跳过。 | -| `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING` | Google Scholar | 设 `=1` 才启用。默认关闭(Scholar ToS 禁止爬取)。 | +| `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` | Scholar + IEEE + paywalled-PDF downloads | Persistent Chrome `--user-data-dir`. 
Set this and complete VPN / SSO once; subsequent runs inherit the cookies. | +| `AUTOPAPERTOPPT_DISABLE_WEBRUNNER` | Scholar + IEEE + paywalled-PDF downloads | `=1` forces the httpx paths instead of driving real Chrome. For CI / Docker without a Chrome binary. | +| `AUTOPAPERTOPPT_CORE_API_KEY` | OA resolver | Free key from . Enables the CORE.ac.uk lookup step in the OA PDF resolver. | +| `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING` | Google Scholar | 设 `=1` 才启用。默认关闭(Scholar ToS 禁止爬取)。 | | `AUTOPAPERTOPPT_PDF_COOKIES_FILE` | PDF 下载器 | Netscape `cookies.txt`,默认关闭。请只用在你有合法访问权的出版商。 | | `AUTOPAPERTOPPT_LOG_LEVEL` | logger | 默认 `INFO`;`DEBUG` 可看更详细。 | diff --git a/readmes/README.zh-TW.md b/readmes/README.zh-TW.md index dadfe74..5c40873 100644 --- a/readmes/README.zh-TW.md +++ b/readmes/README.zh-TW.md @@ -186,10 +186,13 @@ py -m autopapertoppt --paper "https://arxiv.org/abs/1706.03762" ` | `AUTOPAPERTOPPT_NCBI_API_KEY` | PubMed | 把 NCBI 匿名限額(3/s)拉到 10/s,選用。 | | `AUTOPAPERTOPPT_CONTACT_EMAIL` | PubMed、ACM、Crossref、OpenAlex | 讓 Crossref 等把請求放進「客氣池」。 | | `AUTOPAPERTOPPT_IEEE_API_KEY` | IEEE(API 路徑) | 切換到官方 Xplore API,訂閱範圍內會帶 `pdf_url`。 | -| `AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING` | IEEE(爬取路徑) | 設 `=1` 才會啟用爬取。若已設 API key,此變數不需要。 | +| `AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING` | IEEE(爬取路徑) | 設 `=1` 才會啟用爬取。若已設 API key,此變數不需要。 | | `AUTOPAPERTOPPT_CROSSREF_PLUS_TOKEN` | ACM、Crossref | Crossref Plus 訂閱 token(Bearer header),選用。 | | `AUTOPAPERTOPPT_SPRINGER_API_KEY` | Springer | 必填;免費 key 申請 。沒設則該 plugin 會被靜默跳過。 | -| `AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING` | Google Scholar | 設 `=1` 才會啟用。預設關閉(Scholar ToS 禁止爬取)。 | +| `AUTOPAPERTOPPT_CHROME_PROFILE_DIR` | Scholar + IEEE + paywalled-PDF downloads | Persistent Chrome `--user-data-dir`. Set this and complete VPN / SSO once; subsequent runs inherit the cookies. | +| `AUTOPAPERTOPPT_DISABLE_WEBRUNNER` | Scholar + IEEE + paywalled-PDF downloads | `=1` forces the httpx paths instead of driving real Chrome. For CI / Docker without a Chrome binary. 
|
+| `AUTOPAPERTOPPT_CORE_API_KEY` | OA resolver | Free key from . Enables the CORE.ac.uk lookup step in the OA PDF resolver. |
+| `AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING` | Google Scholar | 設 `=1` 才會啟用。預設關閉(Scholar ToS 禁止爬取)。 |
 | `AUTOPAPERTOPPT_PDF_COOKIES_FILE` | PDF 下載器 | Netscape `cookies.txt`,預設關閉。請只用在你有合法存取權的出版商。 |
 | `AUTOPAPERTOPPT_LOG_LEVEL` | logger | 預設 `INFO`;`DEBUG` 可看更詳細。 |
 
diff --git a/scripts/_update_readme_envvars.py b/scripts/_update_readme_envvars.py
new file mode 100644
index 0000000..d3f2fc0
--- /dev/null
+++ b/scripts/_update_readme_envvars.py
@@ -0,0 +1,137 @@
+"""Patch the 13 non-English READMEs to match the current env-var table.
+
+The translated READMEs duplicate the English README's "Environment variables"
+section verbatim except for prose around each row, so the surgical edits are:
+
+1. ENABLE_IEEE_SCRAPING row -> DISABLE_IEEE_SCRAPING (opt-out, default-on).
+2. ENABLE_SCHOLAR_SCRAPING row -> DISABLE_SCHOLAR_SCRAPING (same).
+3. Append three NEW rows (CHROME_PROFILE_DIR, DISABLE_WEBRUNNER, CORE_API_KEY)
+   after the SPRINGER_API_KEY row in each language file.
+
+The purpose text of the renamed and new rows is English. The old localized
+purpose text said "`=1` enables scraping", the opposite of what a DISABLE_*
+variable does, so it cannot stay next to the new name. Translating every nuance
+into 13 languages is out of scope; adopt-as-you-can in future PRs. The
+localized "used by" column is kept as-is.
+
+Re-running is a no-op. Running it on a README whose rows were already renamed
+(but still carry the old "enables" text) repairs those rows.
+
+Usage:
+    .venv\\Scripts\\python.exe -m scripts._update_readme_envvars
+"""
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+READMES_DIR = REPO / "readmes"
+
+# (variable name, English purpose text). Same as README.md.
+_DISABLE_IEEE = (
+    "AUTOPAPERTOPPT_DISABLE_IEEE_SCRAPING",
+    "**IEEE is default-ON via visible Chrome.** Set `=1` to opt out (e.g. CI without Chrome).",
+)
+_DISABLE_SCHOLAR = (
+    "AUTOPAPERTOPPT_DISABLE_SCHOLAR_SCRAPING",
+    "**Scholar is default-ON via visible Chrome.** Set `=1` to opt out (Google ToS forbids automation).",
+)
+# Old opt-in name -> (new opt-out name, purpose text for its table row).
+_RENAMES = {
+    "AUTOPAPERTOPPT_ENABLE_IEEE_SCRAPING": _DISABLE_IEEE,
+    "AUTOPAPERTOPPT_ENABLE_SCHOLAR_SCRAPING": _DISABLE_SCHOLAR,
+}
+# (variable name, "used by" text, English purpose text).
+_NEW_ROWS = (
+    (
+        "AUTOPAPERTOPPT_CHROME_PROFILE_DIR",
+        "Scholar + IEEE + paywalled-PDF downloads",
+        "Persistent Chrome `--user-data-dir`. Set this and complete VPN / SSO once; subsequent runs inherit the cookies.",
+    ),
+    (
+        "AUTOPAPERTOPPT_DISABLE_WEBRUNNER",
+        "Scholar + IEEE + paywalled-PDF downloads",
+        "`=1` forces the httpx paths instead of driving real Chrome. For CI / Docker without a Chrome binary.",
+    ),
+    (
+        "AUTOPAPERTOPPT_CORE_API_KEY",
+        "OA resolver",
+        "Free key from . Enables the CORE.ac.uk lookup step in the OA PDF resolver.",
+    ),
+)
+# Springer row plus its line ending; the new rows are injected right after it.
+_SPRINGER_ROW = re.compile(
+    r"(\| `AUTOPAPERTOPPT_SPRINGER_API_KEY` \|[^\r\n]*)(\r?\n)"
+)
+
+
+def _rewrite_purpose(text: str, env: str, purpose: str) -> str:
+    """Replace the purpose cell of the table row for *env*.
+
+    The localized "used by" cell is kept; everything after it up to the end of
+    the line is replaced, and the line ending itself is left untouched.
+    """
+    row = re.compile(
+        rf"\| `{re.escape(env)}` \| (?P<used_by>[^|\r\n]*?) \|[^\r\n]*"
+    )
+    # A function replacement keeps backslashes in *purpose* literal.
+    return row.sub(
+        lambda m: f"| `{env}` | {m['used_by']} | {purpose} |",
+        text,
+    )
+
+
+def _patch(text: str) -> str:
+    """Return *text* with the env-var table brought up to date.
+
+    Args:
+        text: Full contents of one translated README.
+
+    Returns:
+        The patched contents; identical to *text* when nothing needs changing.
+    """
+    for old, (new, purpose) in _RENAMES.items():
+        # Rename every backticked mention, inside or outside the table.
+        text = text.replace(f"`{old}`", f"`{new}`")
+        # The old purpose cell describes an opt-IN switch; replace it so the
+        # row does not claim that a DISABLE_* variable enables scraping.
+        text = _rewrite_purpose(text, new, purpose)
+
+    def insert(match: re.Match[str]) -> str:
+        springer_row, eol = match.groups()
+        missing = [
+            f"| `{env}` | {used_by} | {purpose} |{eol}"
+            for env, used_by, purpose in _NEW_ROWS
+            # Skip rows that already exist so re-running stays a no-op.
+            if f"`{env}`" not in text
+        ]
+        return springer_row + eol + "".join(missing)
+
+    return _SPRINGER_ROW.sub(insert, text, count=1)
+
+
+def main() -> int:
+    """Patch every ``readmes/README.*.md`` in place and report what changed.
+
+    Returns:
+        Process exit status; always 0.
+    """
+    changed = 0
+    for path in sorted(READMES_DIR.glob("README.*.md")):
+        # newline="" turns off newline translation so each file keeps its
+        # existing line endings, also when this runs on Windows.
+        with path.open(encoding="utf-8", newline="") as handle:
+            before = handle.read()
+        after = _patch(before)
+        if after != before:
+            with path.open("w", encoding="utf-8", newline="") as handle:
+                handle.write(after)
+            changed += 1
+            print(f"updated {path.name}")
+    print(f"{changed} file(s) changed")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())