1
0

fetch_images.py 4.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. #!/usr/bin/env python3
  2. """
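Fetch real images (public domain / CC) from Wikimedia Commons for the huashu-design
"real images for content-driven designs" step (Phase 3.5).

Why this script exists: content-driven designs (parrots / coffee / Malaysia ...) must use
real images; CSS colour blocks are not an acceptable stand-in.
Having the model write the fetching logic from scratch every time is slow and error-prone
(forgetting to clear the proxy -> TLS failure / forgetting a compliant UA -> HTTP 429).
The logic is fixed here once; next time only the keywords need to change.

Usage:
    python3 scripts/fetch_images.py --query "Petronas Towers" "Langkawi beach" "George Town street" \
        --out <project>/assets/img --count 2 --width 1600

For each query the first `count` results are taken, scaled to `width`, downloaded into `out`,
and a manifest is printed (path | license | author | source page) to make the honesty check easy.
If nothing at all could be fetched -> exit code 1, with a hint to use the Phase 3.5 three-level
image fallback (Unsplash/Pexels -> generated image -> honest placeholder).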
  11. """
  12. import argparse, json, os, re, sys, urllib.parse, urllib.request
  13. # ① 清代理:本机 curl/urllib 走代理会 TLS 炸(见 memory feedback_gemini_proxy)
  14. for _k in ("ALL_PROXY", "all_proxy", "HTTP_PROXY", "http_proxy", "HTTPS_PROXY", "https_proxy"):
  15. os.environ.pop(_k, None)
  16. API = "https://commons.wikimedia.org/w/api.php"
  17. # ② 合规 User-Agent 是硬性要求,否则 Wikimedia 返 429
  18. UA = "huashu-design-image-fetcher/1.0 (https://huasheng.ai; skill contact)"
  19. def _api_get(params):
  20. url = API + "?" + urllib.parse.urlencode(params)
  21. req = urllib.request.Request(url, headers={"User-Agent": UA})
  22. with urllib.request.urlopen(req, timeout=30) as r:
  23. return json.load(r)
  24. def _safe(name):
  25. return re.sub(r"[^\w\-.]", "_", name)[:60]
  26. def fetch(query, out, count, width):
  27. params = {
  28. "action": "query", "format": "json", "generator": "search",
  29. "gsrsearch": query, "gsrnamespace": 6, "gsrlimit": count,
  30. "prop": "imageinfo", "iiprop": "url|extmetadata", "iiurlwidth": width,
  31. }
  32. try:
  33. data = _api_get(params)
  34. except Exception as e:
  35. print(f"[FAIL search] {query}: {e}", file=sys.stderr)
  36. return []
  37. pages = (data.get("query", {}) or {}).get("pages", {})
  38. got = []
  39. for p in list(pages.values())[:count]:
  40. ii = (p.get("imageinfo") or [{}])[0]
  41. thumb = ii.get("thumburl") or ii.get("url")
  42. if not thumb:
  43. continue
  44. meta = ii.get("extmetadata", {}) or {}
  45. lic = (meta.get("LicenseShortName", {}) or {}).get("value", "?")
  46. artist = re.sub("<[^>]+>", "", (meta.get("Artist", {}) or {}).get("value", "?")).strip()
  47. ext = os.path.splitext(thumb)[1].split("?")[0] or ".jpg"
  48. fn = _safe(query) + "_" + _safe(p.get("title", "img").replace("File:", ""))
  49. fn = os.path.splitext(fn)[0][:55] + ext
  50. path = os.path.join(out, fn)
  51. try:
  52. req = urllib.request.Request(thumb, headers={"User-Agent": UA})
  53. with urllib.request.urlopen(req, timeout=60) as r, open(path, "wb") as f:
  54. f.write(r.read())
  55. got.append(path)
  56. print(f"[OK] {path} | {lic} | {artist} | {ii.get('descriptionurl','')}")
  57. except Exception as e:
  58. print(f"[FAIL dl] {thumb}: {e}", file=sys.stderr)
  59. if not got:
  60. print(f"[EMPTY] 「{query}」没抓到——换关键词,或走 Phase 3.5 兜底", file=sys.stderr)
  61. return got
  62. def main():
  63. ap = argparse.ArgumentParser(description="Wikimedia Commons 真图抓取(huashu-design Phase 3.5)")
  64. ap.add_argument("--query", nargs="+", required=True, help="一个或多个英文关键词(英文命中率高)")
  65. ap.add_argument("--out", required=True, help="输出目录(建议 项目/assets/img)")
  66. ap.add_argument("--count", type=int, default=2, help="每个关键词抓几张(默认 2)")
  67. ap.add_argument("--width", type=int, default=1600, help="缩放宽度 px(默认 1600)")
  68. a = ap.parse_args()
  69. os.makedirs(a.out, exist_ok=True)
  70. allgot = []
  71. for q in a.query:
  72. allgot += fetch(q, a.out, a.count, a.width)
  73. print(f"\n=== 共下载 {len(allgot)} 张到 {a.out} ===")
  74. print("⚠️ 诚实性核对:去掉每张图信息是否有损?许可是否允许用途?不合适的删掉。")
  75. if not allgot:
  76. print("❌ 全部失败 → 走 Phase 3.5 取图三级兜底(Unsplash/Pexels → 生图 → 诚实 placeholder,不卡流程)", file=sys.stderr)
  77. sys.exit(1)
  78. if __name__ == "__main__":
  79. main()