| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- #!/usr/bin/env python3
- """
- 从 Wikimedia Commons 抓真实图片(公共领域 / CC),供 huashu-design「内容型设计取真图」用(Phase 3.5)。
- 为什么有这个脚本:内容型设计(鹦鹉/咖啡/马来西亚…)必须用真图,不能 CSS 色块糊弄。
- 每次让模型现写抓图逻辑既慢又容易漏坑(忘清代理→TLS 炸 / 忘合规 UA→429)。这里固化好,下次只改关键词。
- 用法:
- python3 scripts/fetch_images.py --query "Petronas Towers" "Langkawi beach" "George Town street" \
- --out 项目/assets/img --count 2 --width 1600
- 每个 query 取前 count 张、缩放到 width、下载到 out,并打印清单(路径 | 许可 | 作者 | 来源页)便于诚实性核对。
- 全部抓不到 → 退出码 1,提示走 Phase 3.5 取图三级兜底(Unsplash/Pexels → 生图 → 诚实 placeholder)。
- """
- import argparse, json, os, re, sys, urllib.parse, urllib.request
# (1) Disable proxies: on this machine curl/urllib requests routed through a
# proxy fail during TLS (see memory feedback_gemini_proxy), so drop every
# proxy variable urllib would consult.
for _k in ("ALL_PROXY", "all_proxy", "HTTP_PROXY", "http_proxy", "HTTPS_PROXY", "https_proxy"):
    os.environ.pop(_k, None)
# Once no proxy variable is set, urllib falls back to the system proxy
# configuration on macOS and Windows. An opener built with an empty
# ProxyHandler mapping switches that auto-detection off as well, so every
# later urlopen() call connects directly.
urllib.request.install_opener(
    urllib.request.build_opener(urllib.request.ProxyHandler({}))
)

# MediaWiki API endpoint of Wikimedia Commons.
API = "https://commons.wikimedia.org/w/api.php"
# (2) A compliant User-Agent is a hard requirement; without one Wikimedia
# answers with HTTP 429.
UA = "huashu-design-image-fetcher/1.0 (https://huasheng.ai; skill contact)"
def _api_get(params):
    """Send a GET request to the Commons API and return the decoded JSON reply.

    Args:
        params: Mapping of query parameters; URL-encoded and appended to API.

    Returns:
        The parsed JSON body of the response.

    Errors raised while opening the URL or decoding the JSON are not handled
    here and propagate to the caller.
    """
    query_string = urllib.parse.urlencode(params)
    request = urllib.request.Request(
        f"{API}?{query_string}",
        headers={"User-Agent": UA},
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = json.load(response)
    return payload
- def _safe(name):
- return re.sub(r"[^\w\-.]", "_", name)[:60]
def fetch(query, out, count, width):
    """Search Wikimedia Commons for *query* and download the matching images.

    Args:
        query: Search phrase passed to the Commons search generator.
        out: Directory the files are written into (must already exist).
        count: Maximum number of search hits to download.
        width: Thumbnail width in pixels requested from the API.

    Returns:
        The list of file paths written. Empty when the search request fails
        or no hit could be downloaded; both cases are reported on stderr.

    For every downloaded file a manifest line
    ``[OK] path | license | artist | description page`` is printed to stdout.
    """
    params = {
        "action": "query", "format": "json", "generator": "search",
        "gsrsearch": query, "gsrnamespace": 6, "gsrlimit": count,
        "prop": "imageinfo", "iiprop": "url|extmetadata", "iiurlwidth": width,
    }
    try:
        data = _api_get(params)
    except Exception as e:
        # Best-effort: a failing query is reported, the caller carries on
        # with the remaining queries.
        print(f"[FAIL search] {query}: {e}", file=sys.stderr)
        return []
    pages = (data.get("query") or {}).get("pages") or {}
    # The response is a mapping of pages, not a ranked list. Generator hits
    # are expected to carry an "index" field with their search rank (see the
    # MediaWiki API docs); pages without it keep their original order.
    ranked = sorted(pages.values(), key=lambda p: p.get("index", 0))
    got = []
    for p in ranked[:count]:
        ii = (p.get("imageinfo") or [{}])[0]
        thumb = ii.get("thumburl") or ii.get("url")
        if not thumb:
            continue
        meta = ii.get("extmetadata") or {}
        # "or" guards also cover explicit null values in the metadata.
        lic = (meta.get("LicenseShortName") or {}).get("value") or "?"
        artist_html = str((meta.get("Artist") or {}).get("value") or "?")
        artist = re.sub("<[^>]+>", "", artist_html).strip()
        # Take the extension from the URL path only, so a query string or
        # fragment can never leak into the file name.
        ext = os.path.splitext(urllib.parse.urlsplit(thumb).path)[1] or ".jpg"
        fn = _safe(query) + "_" + _safe((p.get("title") or "img").replace("File:", ""))
        stem = os.path.splitext(fn)[0][:55]
        path = os.path.join(out, stem + ext)
        # Truncation can map several hits of one query onto the same name;
        # add a numeric suffix instead of overwriting an earlier download.
        serial = 2
        while path in got:
            suffix = f"_{serial}"
            path = os.path.join(out, stem[:55 - len(suffix)] + suffix + ext)
            serial += 1
        try:
            req = urllib.request.Request(thumb, headers={"User-Agent": UA})
            # Read the whole body before touching the disk so that a failed
            # transfer does not leave an empty or truncated file behind.
            with urllib.request.urlopen(req, timeout=60) as r:
                payload = r.read()
            with open(path, "wb") as f:
                f.write(payload)
            got.append(path)
            print(f"[OK] {path} | {lic} | {artist} | {ii.get('descriptionurl','')}")
        except Exception as e:
            # Best-effort: report this hit and continue with the next one.
            print(f"[FAIL dl] {thumb}: {e}", file=sys.stderr)
    if not got:
        print(f"[EMPTY] 「{query}」没抓到——换关键词,或走 Phase 3.5 兜底", file=sys.stderr)
    return got
def main():
    """Parse the command line, download images per query and print a summary.

    --count and --width must be positive integers; anything else is rejected
    by argparse with a usage error (exit status 2) before any directory is
    created or any request is sent. Exits with status 1 when not a single
    image could be downloaded.
    """
    def positive_int(text):
        """argparse type converter accepting only integers >= 1."""
        try:
            value = int(text)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
        if value < 1:
            raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
        return value

    ap = argparse.ArgumentParser(description="Wikimedia Commons 真图抓取(huashu-design Phase 3.5)")
    ap.add_argument("--query", nargs="+", required=True, help="一个或多个英文关键词(英文命中率高)")
    ap.add_argument("--out", required=True, help="输出目录(建议 项目/assets/img)")
    ap.add_argument("--count", type=positive_int, default=2, help="每个关键词抓几张(默认 2)")
    ap.add_argument("--width", type=positive_int, default=1600, help="缩放宽度 px(默认 1600)")
    a = ap.parse_args()
    os.makedirs(a.out, exist_ok=True)
    allgot = []
    for q in a.query:
        allgot.extend(fetch(q, a.out, a.count, a.width))
    print(f"\n=== 共下载 {len(allgot)} 张到 {a.out} ===")
    print("⚠️ 诚实性核对:去掉每张图信息是否有损?许可是否允许用途?不合适的删掉。")
    if not allgot:
        print("❌ 全部失败 → 走 Phase 3.5 取图三级兜底(Unsplash/Pexels → 生图 → 诚实 placeholder,不卡流程)", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|