diff --git a/firstdata/sources/academic/ai-ml/README.md b/firstdata/sources/academic/ai-ml/README.md new file mode 100644 index 0000000..48072c6 --- /dev/null +++ b/firstdata/sources/academic/ai-ml/README.md @@ -0,0 +1,135 @@ +# 人工智能与机器学习 | AI & Machine Learning + +**总数**: 15+个数据源 +**已完成**: 12个 +**进度**: 80% + +--- + +## 📊 总体进度 + +``` +总目标: 15+ 个高质量 AI/ML 数据源 +当前完成: 12 个 +完成度: ████████░░ 80% +``` + +--- + +## 📚 已收录数据源 + +### 🗂️ 数据集平台 (4个) + +#### Hugging Face Datasets +- **文件**: [huggingface-datasets.json](huggingface-datasets.json) ⭐💎 +- **权威等级**: industry +- **类型**: 数据集平台、模型训练数据、基准测试 +- **涵盖**: 全球,2020-至今,100,000+数据集 +- **更新频率**: 持续 +- **特色**: AI/ML社区事实标准,支持NLP、CV、音频、多模态 + +#### Kaggle Datasets +- **文件**: [kaggle-datasets.json](kaggle-datasets.json) ⭐💎 +- **权威等级**: industry +- **类型**: 数据集、竞赛、Notebooks、模型 +- **涵盖**: 全球,2010-至今,200,000+数据集 +- **更新频率**: 持续 +- **特色**: Google 旗下,1500万+用户,数据科学社区标准 + +#### UCI Machine Learning Repository +- **文件**: [uci-ml-repository.json](uci-ml-repository.json) ⭐💎 +- **权威等级**: research +- **类型**: 数据集、基准 +- **涵盖**: 全球,1987-至今,600+数据集 +- **更新频率**: 月度 +- **特色**: 30+年历史,10万+论文引用,ML基准黄金标准 + +#### OpenML +- **文件**: [openml.json](openml.json) ⭐💎 +- **权威等级**: research +- **类型**: 数据集、基准、实验、ML流程 +- **涵盖**: 全球,2013-至今,5,000+数据集,10M+实验 +- **更新频率**: 持续 +- **特色**: 可重复研究,scikit-learn集成,EU资助 + +### 🔍 搜索与发现 (2个) + +#### Google Dataset Search +- **文件**: [google-dataset-search.json](google-dataset-search.json) ⭐💎 +- **权威等级**: industry +- **类型**: 搜索引擎、数据集、元数据 +- **涵盖**: 全球,2018-至今,2500万+索引数据集 +- **更新频率**: 持续 +- **特色**: Google Research产品,跨平台发现 + +#### Papers With Code +- **文件**: [papers-with-code.json](papers-with-code.json) ⭐💎 +- **权威等级**: research +- **类型**: 论文、代码、数据集、基准、SOTA结果 +- **涵盖**: 全球,2012-至今,300,000+论文,6,000+基准 +- **更新频率**: 每日 +- **特色**: 追踪各任务SOTA,Meta AI收购,研究社区标准参考 + +### ☁️ 云端大规模数据 (2个) + +#### AWS Registry of Open Data +- **文件**: [aws-open-data.json](aws-open-data.json) ⭐💎 +- **权威等级**: industry +- **类型**: 数据集、卫星、基因组、气候、地理空间 +- **涵盖**: 
全球,2017-至今,400+数据集,PB级 +- **更新频率**: 持续 +- **特色**: 与 NASA、NOAA、NIH 合作,免费云端访问 + +#### Microsoft Research Open Data +- **文件**: [microsoft-research-open-data.json](microsoft-research-open-data.json) ⭐💎 +- **权威等级**: industry +- **类型**: 数据集、NLP、计算机视觉、社交网络 +- **涵盖**: 全球,2018-至今,100+数据集 +- **更新频率**: 季度 +- **特色**: 微软研究院官方,广泛引用的基准数据集 + +### 📄 学术论文与预印本 (2个) + +#### arXiv 计算机科学 +- **文件**: [arxiv-cs.json](arxiv-cs.json) ⭐💎 +- **权威等级**: research +- **类型**: 预印本论文、研究 +- **涵盖**: 全球,1991-至今,2.4M+论文 +- **更新频率**: 每日 +- **特色**: AI/ML研究首发平台,康奈尔大学运营 + +#### ACL Anthology +- **文件**: [acl-anthology.json](acl-anthology.json) ⭐💎 +- **权威等级**: research +- **类型**: 论文、会议录、数据集 +- **涵盖**: 全球,1965-至今,90,000+论文 +- **更新频率**: 持续 +- **特色**: NLP领域最权威,ACL/EMNLP/NAACL顶会论文 + +### 🏆 基准与标准 (1个) + +#### MLCommons +- **文件**: [mlcommons.json](mlcommons.json) ⭐💎 +- **权威等级**: industry +- **类型**: 基准、数据集、标准 +- **涵盖**: 全球,2018-至今,10+数据集,5+基准套件 +- **更新频率**: 半年 +- **特色**: MLPerf行业标准,Google/NVIDIA/Intel/Meta联合 + +### 🗄️ 数据存档 (1个) + +#### Zenodo (Machine Learning) +- **文件**: [zenodo-ml.json](zenodo-ml.json) ⭐💎 +- **权威等级**: research +- **类型**: 数据集、代码、论文、模型 +- **涵盖**: 全球,2013-至今,300万+记录 +- **更新频率**: 持续 +- **特色**: CERN运营,永久DOI,学术存档标准 + +--- + +## 🎯 待添加 + +- [ ] ImageNet (计算机视觉基准) +- [ ] Common Crawl (网页语料) +- [ ] The Pile (大型语言模型预训练) diff --git a/firstdata/sources/academic/ai-ml/acl-anthology.json b/firstdata/sources/academic/ai-ml/acl-anthology.json new file mode 100644 index 0000000..3130304 --- /dev/null +++ b/firstdata/sources/academic/ai-ml/acl-anthology.json @@ -0,0 +1,34 @@ +{ + "id": "acl-anthology", + "name": { + "zh": "ACL Anthology", + "en": "ACL Anthology" + }, + "description": { + "zh": "计算语言学协会论文库,NLP 领域最权威的学术论文档案,包含 ACL、EMNLP、NAACL 等顶会论文", + "en": "Association for Computational Linguistics paper archive, most authoritative academic papers in NLP including ACL, EMNLP, NAACL" + }, + "url": "https://aclanthology.org/", + "authority_level": "research", + "authority_justification": { + "zh": "计算语言学协会官方维护,NLP 
领域所有顶级会议和期刊的唯一权威档案", + "en": "Officially maintained by ACL, the only authoritative archive for all top NLP conferences and journals" + }, + "data_type": ["papers", "proceedings", "datasets"], + "coverage": { + "geographic": "global", + "temporal": "1965-present", + "indicators": "90,000+ papers" + }, + "update_frequency": "continuous", + "access_method": { + "web": "https://aclanthology.org/", + "api": "https://aclanthology.org/anthology+abstracts.bib.gz", + "github": "https://github.com/acl-org/acl-anthology" + }, + "license": "CC BY 4.0", + "languages": ["en"], + "tags": ["nlp", "computational-linguistics", "papers", "conferences", "emnlp", "acl"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/academic/ai-ml/arxiv-cs.json b/firstdata/sources/academic/ai-ml/arxiv-cs.json new file mode 100644 index 0000000..79699a5 --- /dev/null +++ b/firstdata/sources/academic/ai-ml/arxiv-cs.json @@ -0,0 +1,34 @@ +{ + "id": "arxiv-cs", + "name": { + "zh": "arXiv 计算机科学", + "en": "arXiv Computer Science" + }, + "description": { + "zh": "康奈尔大学运营的开放获取预印本服务器,是 AI/ML 研究的首发平台,每天数百篇新论文", + "en": "Open-access preprint server operated by Cornell University, the primary venue for AI/ML research with hundreds of new papers daily" + }, + "url": "https://arxiv.org/list/cs.AI/recent", + "authority_level": "research", + "authority_justification": { + "zh": "康奈尔大学运营,获得西蒙斯基金会等资助,是全球物理学和计算机科学的标准预印本平台", + "en": "Operated by Cornell University, funded by Simons Foundation, the standard preprint platform for physics and CS globally" + }, + "data_type": ["papers", "preprints", "research"], + "coverage": { + "geographic": "global", + "temporal": "1991-present", + "indicators": "2.4M+ papers, 50,000+ CS.AI papers" + }, + "update_frequency": "daily", + "access_method": { + "web": "https://arxiv.org", + "api": "https://info.arxiv.org/help/api/index.html", + "bulk": "https://info.arxiv.org/help/bulk_data.html" + }, + "license": "varies by paper (mostly CC BY)", + 
"languages": ["en"], + "tags": ["artificial-intelligence", "machine-learning", "deep-learning", "nlp", "computer-vision", "preprints"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/academic/ai-ml/aws-open-data.json b/firstdata/sources/academic/ai-ml/aws-open-data.json new file mode 100644 index 0000000..aba3861 --- /dev/null +++ b/firstdata/sources/academic/ai-ml/aws-open-data.json @@ -0,0 +1,34 @@ +{ + "id": "aws-open-data", + "name": { + "zh": "AWS 开放数据注册表", + "en": "AWS Registry of Open Data" + }, + "description": { + "zh": "Amazon 托管的大规模公开数据集,包括卫星图像、基因组、气候等 PB 级数据", + "en": "Amazon-hosted large-scale public datasets including satellite imagery, genomics, climate data at PB scale" + }, + "url": "https://registry.opendata.aws/", + "authority_level": "industry", + "authority_justification": { + "zh": "AWS 官方维护,与 NASA、NOAA、NIH 等机构合作,提供免费云端访问", + "en": "Officially maintained by AWS, partnered with NASA, NOAA, NIH, provides free cloud access" + }, + "data_type": ["datasets", "satellite", "genomics", "climate", "geospatial"], + "coverage": { + "geographic": "global", + "temporal": "2017-present", + "indicators": "400+ datasets, PB scale" + }, + "update_frequency": "continuous", + "access_method": { + "web": "https://registry.opendata.aws/", + "aws_cli": "aws s3 ls s3://[dataset-bucket]", + "api": "S3 API" + }, + "license": "varies by dataset", + "languages": ["en"], + "tags": ["cloud", "satellite", "genomics", "climate", "geospatial", "big-data"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/academic/ai-ml/google-dataset-search.json b/firstdata/sources/academic/ai-ml/google-dataset-search.json new file mode 100644 index 0000000..16529d7 --- /dev/null +++ b/firstdata/sources/academic/ai-ml/google-dataset-search.json @@ -0,0 +1,32 @@ +{ + "id": "google-dataset-search", + "name": { + "zh": "Google 数据集搜索", + "en": "Google Dataset Search" + }, + "description": { + "zh": "Google 
提供的数据集搜索引擎,索引全网 2500万+ 数据集,支持跨平台发现", + "en": "Google's dataset search engine, indexing 25M+ datasets across the web for cross-platform discovery" + }, + "url": "https://datasetsearch.research.google.com/", + "authority_level": "industry", + "authority_justification": { + "zh": "Google Research 产品,基于 schema.org/Dataset 标准,覆盖主流数据平台", + "en": "Google Research product, based on schema.org/Dataset standard, covers major data platforms" + }, + "data_type": ["search_engine", "datasets", "metadata"], + "coverage": { + "geographic": "global", + "temporal": "2018-present", + "indicators": "25,000,000+ indexed datasets" + }, + "update_frequency": "continuous", + "access_method": { + "web": "https://datasetsearch.research.google.com/" + }, + "license": "N/A (search engine)", + "languages": ["en", "zh", "multilingual"], + "tags": ["search-engine", "datasets", "discovery", "metadata", "cross-platform"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/academic/ai-ml/huggingface-datasets.json b/firstdata/sources/academic/ai-ml/huggingface-datasets.json new file mode 100644 index 0000000..7175f4d --- /dev/null +++ b/firstdata/sources/academic/ai-ml/huggingface-datasets.json @@ -0,0 +1,34 @@ +{ + "id": "huggingface-datasets", + "name": { + "zh": "Hugging Face Datasets", + "en": "Hugging Face Datasets" + }, + "description": { + "zh": "全球最大的开源机器学习数据集平台,托管超过100,000个数据集,涵盖NLP、计算机视觉、音频、多模态等领域", + "en": "The world's largest open-source ML dataset platform, hosting 100,000+ datasets covering NLP, computer vision, audio, and multimodal domains" + }, + "url": "https://huggingface.co/datasets", + "authority_level": "industry", + "authority_justification": { + "zh": "Hugging Face 是 AI/ML 社区的事实标准平台,获得超过 $400M 融资,与 Google、Amazon、Microsoft 等合作", + "en": "Hugging Face is the de facto standard platform for AI/ML community, raised $400M+, partnered with Google, Amazon, Microsoft" + }, + "data_type": ["datasets", "model_training_data", "benchmarks"], + 
"coverage": { + "geographic": "global", + "temporal": "2020-present", + "indicators": "100,000+ datasets" + }, + "update_frequency": "continuous", + "access_method": { + "web": "https://huggingface.co/datasets", + "api": "https://huggingface.co/docs/datasets/", + "python": "pip install datasets; from datasets import load_dataset" + }, + "license": "varies by dataset", + "languages": ["en", "zh", "multilingual"], + "tags": ["machine-learning", "nlp", "computer-vision", "audio", "multimodal", "open-source"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/academic/ai-ml/kaggle-datasets.json b/firstdata/sources/academic/ai-ml/kaggle-datasets.json new file mode 100644 index 0000000..c456d42 --- /dev/null +++ b/firstdata/sources/academic/ai-ml/kaggle-datasets.json @@ -0,0 +1,34 @@ +{ + "id": "kaggle-datasets", + "name": { + "zh": "Kaggle Datasets", + "en": "Kaggle Datasets" + }, + "description": { + "zh": "全球最大的数据科学竞赛平台,托管 200,000+ 公开数据集,涵盖各行业和研究领域", + "en": "World's largest data science competition platform, hosting 200,000+ public datasets across industries and research domains" + }, + "url": "https://www.kaggle.com/datasets", + "authority_level": "industry", + "authority_justification": { + "zh": "Google 旗下平台,1500万+ 注册用户,数据科学社区事实标准", + "en": "Owned by Google, 15M+ registered users, de facto standard for data science community" + }, + "data_type": ["datasets", "competitions", "notebooks", "models"], + "coverage": { + "geographic": "global", + "temporal": "2010-present", + "indicators": "200,000+ datasets, 50,000+ notebooks" + }, + "update_frequency": "continuous", + "access_method": { + "web": "https://www.kaggle.com/datasets", + "api": "https://www.kaggle.com/docs/api", + "cli": "pip install kaggle; kaggle datasets download" + }, + "license": "varies by dataset", + "languages": ["en"], + "tags": ["machine-learning", "data-science", "competitions", "notebooks", "tabular"], + "verified": true, + "last_verified": "2026-02-02" +} 
diff --git a/firstdata/sources/academic/ai-ml/microsoft-research-open-data.json b/firstdata/sources/academic/ai-ml/microsoft-research-open-data.json new file mode 100644 index 0000000..7dd2482 --- /dev/null +++ b/firstdata/sources/academic/ai-ml/microsoft-research-open-data.json @@ -0,0 +1,33 @@ +{ + "id": "microsoft-research-open-data", + "name": { + "zh": "微软研究院开放数据", + "en": "Microsoft Research Open Data" + }, + "description": { + "zh": "微软研究院发布的研究数据集,涵盖 NLP、计算机视觉、社交网络等领域", + "en": "Research datasets released by Microsoft Research, covering NLP, computer vision, social networks and more" + }, + "url": "https://msropendata.com/", + "authority_level": "industry", + "authority_justification": { + "zh": "微软研究院官方发布,包含多个被广泛引用的基准数据集", + "en": "Officially released by Microsoft Research, includes widely-cited benchmark datasets" + }, + "data_type": ["datasets", "nlp", "computer_vision", "social_networks"], + "coverage": { + "geographic": "global", + "temporal": "2018-present", + "indicators": "100+ datasets" + }, + "update_frequency": "quarterly", + "access_method": { + "web": "https://msropendata.com/", + "azure": "Azure Blob Storage" + }, + "license": "varies by dataset (mostly research use)", + "languages": ["en"], + "tags": ["nlp", "computer-vision", "social-networks", "research", "microsoft"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/academic/ai-ml/mlcommons.json b/firstdata/sources/academic/ai-ml/mlcommons.json new file mode 100644 index 0000000..705dc7e --- /dev/null +++ b/firstdata/sources/academic/ai-ml/mlcommons.json @@ -0,0 +1,33 @@ +{ + "id": "mlcommons", + "name": { + "zh": "MLCommons", + "en": "MLCommons" + }, + "description": { + "zh": "ML 行业联盟,制定 MLPerf 基准标准,发布 People's Speech、Multilingual Librispeech 等大规模数据集", + "en": "ML industry consortium, defining MLPerf benchmark standards, releasing large-scale datasets like People's Speech and Multilingual Librispeech" + }, + "url": "https://mlcommons.org/", + 
"authority_level": "industry", + "authority_justification": { + "zh": "由 Google、NVIDIA、Intel、Meta 等巨头联合成立,MLPerf 是 AI 硬件性能的行业标准", + "en": "Founded by Google, NVIDIA, Intel, Meta, etc. MLPerf is the industry standard for AI hardware performance" + }, + "data_type": ["benchmarks", "datasets", "standards"], + "coverage": { + "geographic": "global", + "temporal": "2018-present", + "indicators": "10+ datasets, 5+ benchmark suites" + }, + "update_frequency": "semi-annual", + "access_method": { + "web": "https://mlcommons.org/datasets/", + "github": "https://github.com/mlcommons" + }, + "license": "CC BY 4.0 / Apache 2.0", + "languages": ["en"], + "tags": ["benchmarks", "mlperf", "hardware", "inference", "training", "speech"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/academic/ai-ml/openml.json b/firstdata/sources/academic/ai-ml/openml.json new file mode 100644 index 0000000..4fb8a93 --- /dev/null +++ b/firstdata/sources/academic/ai-ml/openml.json @@ -0,0 +1,34 @@ +{ + "id": "openml", + "name": { + "zh": "OpenML", + "en": "OpenML" + }, + "description": { + "zh": "开放机器学习平台,提供标准化的数据集、任务、流程和实验结果共享,促进可重复研究", + "en": "Open machine learning platform providing standardized sharing of datasets, tasks, flows and experimental results for reproducible research" + }, + "url": "https://www.openml.org", + "authority_level": "research", + "authority_justification": { + "zh": "TU Eindhoven 等多所大学联合开发,获得欧盟 Horizon 2020 资助,被 scikit-learn 官方集成", + "en": "Developed by TU Eindhoven and multiple universities, funded by EU Horizon 2020, officially integrated with scikit-learn" + }, + "data_type": ["datasets", "benchmarks", "experiments", "ml_pipelines"], + "coverage": { + "geographic": "global", + "temporal": "2013-present", + "indicators": "5,000+ datasets, 20,000+ tasks, 10M+ experiment runs" + }, + "update_frequency": "continuous", + "access_method": { + "web": "https://www.openml.org", + "api": "https://www.openml.org/apis", + "python": "pip 
install openml" + }, + "license": "CC BY 4.0", + "languages": ["en"], + "tags": ["machine-learning", "benchmarks", "reproducibility", "automl", "datasets"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/academic/ai-ml/papers-with-code.json b/firstdata/sources/academic/ai-ml/papers-with-code.json new file mode 100644 index 0000000..b79d9ee --- /dev/null +++ b/firstdata/sources/academic/ai-ml/papers-with-code.json @@ -0,0 +1,33 @@ +{ + "id": "papers-with-code", + "name": { + "zh": "Papers With Code", + "en": "Papers With Code" + }, + "description": { + "zh": "机器学习论文、代码、数据集和基准的综合平台,追踪各任务的 SOTA 模型和结果", + "en": "Comprehensive platform for ML papers, code, datasets and benchmarks, tracking SOTA models and results across tasks" + }, + "url": "https://paperswithcode.com", + "authority_level": "research", + "authority_justification": { + "zh": "被 Meta AI 收购,是 AI 研究社区追踪最新进展的标准参考", + "en": "Acquired by Meta AI, the standard reference for tracking ML research progress in the AI community" + }, + "data_type": ["papers", "code", "datasets", "benchmarks", "sota_results"], + "coverage": { + "geographic": "global", + "temporal": "2012-present", + "indicators": "300,000+ papers, 6,000+ benchmarks, 8,000+ datasets" + }, + "update_frequency": "daily", + "access_method": { + "web": "https://paperswithcode.com", + "api": "https://paperswithcode.com/api/v1/docs/" + }, + "license": "CC BY-SA 4.0", + "languages": ["en"], + "tags": ["machine-learning", "deep-learning", "benchmarks", "sota", "research"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/academic/ai-ml/uci-ml-repository.json b/firstdata/sources/academic/ai-ml/uci-ml-repository.json new file mode 100644 index 0000000..4403513 --- /dev/null +++ b/firstdata/sources/academic/ai-ml/uci-ml-repository.json @@ -0,0 +1,34 @@ +{ + "id": "uci-ml-repository", + "name": { + "zh": "UCI 机器学习库", + "en": "UCI Machine Learning Repository" + }, + "description": { + 
"zh": "加州大学欧文分校维护的经典机器学习数据集库,ML研究的黄金标准基准数据集", + "en": "Classic ML dataset repository maintained by UC Irvine, gold standard benchmark datasets for ML research" + }, + "url": "https://archive.ics.uci.edu/", + "authority_level": "research", + "authority_justification": { + "zh": "UC Irvine 维护 30+ 年,被 10万+ 论文引用,ML 领域最权威的基准数据集来源", + "en": "Maintained by UC Irvine for 30+ years, cited by 100,000+ papers, most authoritative benchmark source in ML" + }, + "data_type": ["datasets", "benchmarks"], + "coverage": { + "geographic": "global", + "temporal": "1987-present", + "indicators": "600+ datasets" + }, + "update_frequency": "monthly", + "access_method": { + "web": "https://archive.ics.uci.edu/", + "api": "https://archive.ics.uci.edu/ml/machine-learning-databases/", + "python": "pip install ucimlrepo; from ucimlrepo import fetch_ucirepo" + }, + "license": "CC BY 4.0", + "languages": ["en"], + "tags": ["machine-learning", "benchmarks", "classification", "regression", "clustering"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/academic/ai-ml/zenodo-ml.json b/firstdata/sources/academic/ai-ml/zenodo-ml.json new file mode 100644 index 0000000..5656842 --- /dev/null +++ b/firstdata/sources/academic/ai-ml/zenodo-ml.json @@ -0,0 +1,34 @@ +{ + "id": "zenodo-ml", + "name": { + "zh": "Zenodo (机器学习社区)", + "en": "Zenodo (Machine Learning Community)" + }, + "description": { + "zh": "CERN 运营的开放研究数据仓库,为 ML 研究提供永久 DOI 和数据存档服务", + "en": "Open research data repository operated by CERN, providing permanent DOI and data archival for ML research" + }, + "url": "https://zenodo.org/communities/ml/", + "authority_level": "research", + "authority_justification": { + "zh": "CERN 和 OpenAIRE 联合运营,欧盟资助,是学术数据存档的标准平台", + "en": "Operated by CERN and OpenAIRE, EU funded, standard platform for academic data archival" + }, + "data_type": ["datasets", "code", "papers", "models"], + "coverage": { + "geographic": "global", + "temporal": "2013-present", + "indicators": 
"3,000,000+ records total, 50,000+ ML related" + }, + "update_frequency": "continuous", + "access_method": { + "web": "https://zenodo.org/", + "api": "https://developers.zenodo.org/", + "doi": "permanent DOI for every dataset" + }, + "license": "varies by record", + "languages": ["en"], + "tags": ["open-science", "doi", "archival", "reproducibility", "cern"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/academic/data-science/mit-ml-data-guide.json b/firstdata/sources/academic/data-science/mit-ml-data-guide.json new file mode 100644 index 0000000..05461d4 --- /dev/null +++ b/firstdata/sources/academic/data-science/mit-ml-data-guide.json @@ -0,0 +1,59 @@ +{ + "id": "mit-ml-data-guide", + "name": { + "en": "MIT Libraries ML/AI Data Guide", + "zh": "MIT图书馆机器学习/AI数据指南" + }, + "description": { + "en": "Curated guide from MIT Libraries covering machine learning and artificial intelligence data sources. Maintained by MIT EECS librarians, providing authoritative listings of datasets, repositories, and resources for ML/AI research and development.", + "zh": "由MIT图书馆整理的机器学习和人工智能数据源指南。由MIT EECS图书馆员维护,提供用于ML/AI研究和开发的权威数据集、存储库和资源列表。" + }, + "website": "https://libraries.mit.edu", + "data_url": "https://libguides.mit.edu/eecs/mldata", + "api_url": null, + "country": "US", + "domains": [ + "artificial-intelligence", + "machine-learning", + "data-science", + "computer-science" + ], + "geographic_scope": "global", + "update_frequency": "irregular", + "tags": [ + "machine-learning", + "artificial-intelligence", + "datasets", + "deep-learning", + "neural-networks", + "computer-vision", + "nlp", + "MIT", + "academic", + "research-data", + "机器学习", + "人工智能", + "数据集" + ], + "data_content": { + "en": [ + "Curated ML/AI dataset repositories (UCI, Kaggle, etc.)", + "Computer vision and image datasets", + "Natural language processing corpora", + "Benchmark datasets for model evaluation", + "Domain-specific ML data sources", + "Dataset search engines and discovery tools", 
+ "Data preparation and preprocessing resources" + ], + "zh": [ + "精选ML/AI数据集存储库(UCI、Kaggle等)", + "计算机视觉和图像数据集", + "自然语言处理语料库", + "模型评估基准数据集", + "特定领域ML数据源", + "数据集搜索引擎和发现工具", + "数据准备和预处理资源" + ] + }, + "authority_level": "research" +} diff --git a/firstdata/sources/countries/usa/data-gov.json b/firstdata/sources/countries/usa/data-gov.json new file mode 100644 index 0000000..f6dd0ab --- /dev/null +++ b/firstdata/sources/countries/usa/data-gov.json @@ -0,0 +1,71 @@ +{ + "id": "us-data-gov", + "name": { + "en": "Data.gov - U.S. Government Open Data", + "zh": "Data.gov - 美国政府开放数据门户" + }, + "description": { + "en": "The official open data portal of the United States federal government. Provides access to over 300,000 datasets from federal agencies covering agriculture, climate, education, energy, finance, health, public safety, science, and more. Managed by the U.S. General Services Administration (GSA).", + "zh": "美国联邦政府官方开放数据门户。提供来自联邦机构的超过30万个数据集,涵盖农业、气候、教育、能源、金融、健康、公共安全、科学等领域。由美国总务管理局(GSA)管理。" + }, + "website": "https://www.data.gov", + "data_url": "https://catalog.data.gov/dataset", + "api_url": "https://catalog.data.gov/api/3", + "country": "US", + "domains": [ + "government", + "economics", + "health", + "education", + "climate", + "energy", + "agriculture", + "transportation", + "public-safety" + ], + "geographic_scope": "national", + "update_frequency": "daily", + "tags": [ + "open-data", + "government-data", + "federal-data", + "USA", + "datasets", + "CKAN", + "api", + "public-records", + "transparency", + "美国政府", + "开放数据", + "联邦数据" + ], + "data_content": { + "en": [ + "Federal agency datasets (300,000+ datasets)", + "Agriculture and food supply data", + "Climate and weather datasets", + "Education statistics and metrics", + "Energy production and consumption data", + "Finance and economic indicators", + "Health and medical research data", + "Public safety and crime statistics", + "Science and research datasets", + "Transportation and infrastructure data", + 
"Geospatial and mapping data" + ], + "zh": [ + "联邦机构数据集(30万+数据集)", + "农业和食品供应数据", + "气候和天气数据集", + "教育统计和指标", + "能源生产和消费数据", + "金融和经济指标", + "健康和医学研究数据", + "公共安全和犯罪统计", + "科学和研究数据集", + "交通和基础设施数据", + "地理空间和地图数据" + ] + }, + "authority_level": "government" +} diff --git a/firstdata/sources/sectors/marketing/README.md b/firstdata/sources/sectors/marketing/README.md new file mode 100644 index 0000000..7b49b76 --- /dev/null +++ b/firstdata/sources/sectors/marketing/README.md @@ -0,0 +1,103 @@ +# 营销与广告 | Marketing & Advertising + +**总数**: 15+个数据源 +**已完成**: 8个 +**进度**: 53% + +--- + +## 📊 总体进度 + +``` +总目标: 15+ 个高质量营销数据源 +当前完成: 8 个 +完成度: █████░░░░░ 53% +``` + +--- + +## 📚 已收录数据源 + +### 📈 市场研究 (2个) + +#### Statista +- **文件**: [statista.json](statista.json) ⭐💎 +- **权威等级**: market +- **类型**: 统计数据、市场研究、信息图、报告 +- **涵盖**: 全球,170+行业,100万+统计数据 +- **更新频率**: 每日 +- **特色**: Fortune 500 广泛使用,中英文支持 + +#### eMarketer / Insider Intelligence +- **文件**: [emarketer.json](emarketer.json) ⭐💎 +- **权威等级**: market +- **类型**: 预测、市场份额、广告支出、用户增长 +- **涵盖**: 全球,数字广告/电商/社交 +- **更新频率**: 月度 +- **特色**: 数字广告预测行业标准,被主流媒体引用 + +### 🔍 SEO 与流量分析 (3个) + +#### Google Trends +- **文件**: [google-trends.json](google-trends.json) ⭐💎 +- **权威等级**: industry +- **类型**: 搜索趋势、关键词、地理分布、时间序列 +- **涵盖**: 全球 200+ 国家,2004-至今 +- **更新频率**: 实时 +- **特色**: Google 官方,免费,学术研究权威来源 + +#### SimilarWeb +- **文件**: [similarweb.json](similarweb.json) ⭐💎 +- **权威等级**: market +- **类型**: 网站流量、排名、受众、竞品分析 +- **涵盖**: 全球,1亿+ 网站追踪 +- **更新频率**: 每日 +- **特色**: 上市公司,被 Google/Adobe 使用 + +#### SEMrush +- **文件**: [semrush.json](semrush.json) ⭐💎 +- **权威等级**: market +- **类型**: SEO、关键词、外链、广告、内容 +- **涵盖**: 全球,250亿+ 关键词,8亿+ 域名 +- **更新频率**: 每日 +- **特色**: 上市公司,1000万+ 用户 + +### 📺 媒体测量 (2个) + +#### Nielsen +- **文件**: [nielsen.json](nielsen.json) ⭐💎 +- **权威等级**: market +- **类型**: 电视收视率、广告效果、消费者行为 +- **涵盖**: 全球 100+ 国家,1923-至今 +- **更新频率**: 每日 +- **特色**: 电视收视率行业标准,百年历史 + +#### Comscore +- **文件**: [comscore.json](comscore.json) ⭐💎 +- **权威等级**: market +- **类型**: 
数字受众、视频、广告、跨平台测量 +- **涵盖**: 全球 75+ 国家 +- **更新频率**: 月度 +- **特色**: MRC认证,跨屏测量标准 + +### 🚀 营销自动化 (1个) + +#### HubSpot Research +- **文件**: [hubspot-research.json](hubspot-research.json) ⭐💎 +- **权威等级**: industry +- **类型**: 报告、基准、统计、调研 +- **涵盖**: 全球,营销/销售/CRM +- **更新频率**: 年度 +- **特色**: 20万+ 客户一手数据,State of Marketing 报告 + +--- + +## 🎯 待添加 + +- [ ] Meta Business Suite Insights +- [ ] Twitter/X Analytics +- [ ] LinkedIn Marketing Solutions +- [ ] TikTok Business Center +- [ ] App Annie / data.ai +- [ ] Sensor Tower +- [ ] Kantar diff --git a/firstdata/sources/sectors/marketing/comscore.json b/firstdata/sources/sectors/marketing/comscore.json new file mode 100644 index 0000000..11e6ecf --- /dev/null +++ b/firstdata/sources/sectors/marketing/comscore.json @@ -0,0 +1,33 @@ +{ + "id": "comscore", + "name": { + "zh": "Comscore", + "en": "Comscore" + }, + "description": { + "zh": "跨平台媒体测量公司,提供数字广告、视频、电视跨屏观众测量数据", + "en": "Cross-platform media measurement company providing digital advertising, video, TV cross-screen audience measurement" + }, + "url": "https://www.comscore.com/", + "authority_level": "market", + "authority_justification": { + "zh": "上市公司(NASDAQ: SCOR),美国 MRC 认证的数字测量标准,被顶级媒体公司采用", + "en": "Public company (NASDAQ: SCOR), MRC-accredited digital measurement standard, adopted by top media companies" + }, + "data_type": ["digital_audience", "video", "advertising", "cross_platform"], + "coverage": { + "geographic": "global (75+ countries)", + "temporal": "1999-present", + "indicators": "digital, streaming, TV, movies" + }, + "update_frequency": "monthly", + "access_method": { + "web": "https://www.comscore.com/", + "reports": "Comscore Insights" + }, + "license": "subscription", + "languages": ["en"], + "tags": ["digital-audience", "video", "streaming", "advertising", "cross-platform"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/sectors/marketing/emarketer.json b/firstdata/sources/sectors/marketing/emarketer.json new file mode 
100644 index 0000000..2f66b43 --- /dev/null +++ b/firstdata/sources/sectors/marketing/emarketer.json @@ -0,0 +1,33 @@ +{ + "id": "emarketer", + "name": { + "zh": "eMarketer / Insider Intelligence", + "en": "eMarketer / Insider Intelligence" + }, + "description": { + "zh": "数字营销和电商行业研究机构,提供广告支出预测、用户增长、平台份额等数据", + "en": "Digital marketing and e-commerce research firm providing ad spending forecasts, user growth, platform market share data" + }, + "url": "https://www.insiderintelligence.com/", + "authority_level": "market", + "authority_justification": { + "zh": "Axel Springer 旗下,数字广告预测的行业标准,被 WSJ、NYT 等主流媒体引用", + "en": "Owned by Axel Springer, industry standard for digital ad forecasts, cited by WSJ, NYT and major media" + }, + "data_type": ["forecasts", "market_share", "ad_spending", "user_growth"], + "coverage": { + "geographic": "global (focus US, China, EU)", + "temporal": "1996-present", + "indicators": "digital ads, e-commerce, social, mobile" + }, + "update_frequency": "monthly", + "access_method": { + "web": "https://www.insiderintelligence.com/", + "reports": "Insider Intelligence Reports" + }, + "license": "subscription (some charts free)", + "languages": ["en"], + "tags": ["digital-advertising", "e-commerce", "social-media", "forecasts", "market-share"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/sectors/marketing/google-trends.json b/firstdata/sources/sectors/marketing/google-trends.json new file mode 100644 index 0000000..15491d8 --- /dev/null +++ b/firstdata/sources/sectors/marketing/google-trends.json @@ -0,0 +1,34 @@ +{ + "id": "google-trends", + "name": { + "zh": "Google Trends", + "en": "Google Trends" + }, + "description": { + "zh": "Google 搜索趋势数据,提供关键词热度、地区分布、相关查询、实时趋势", + "en": "Google search trends data providing keyword popularity, geographic distribution, related queries, real-time trends" + }, + "url": "https://trends.google.com/", + "authority_level": "industry", + "authority_justification": { + "zh": "Google 
官方产品,基于全球最大搜索引擎的真实搜索数据,学术研究和商业分析的权威来源", + "en": "Official Google product based on real search data from world's largest search engine, authoritative source for research and business analysis" + }, + "data_type": ["search_trends", "keywords", "geographic", "time_series"], + "coverage": { + "geographic": "global (200+ countries)", + "temporal": "2004-present", + "indicators": "unlimited keywords" + }, + "update_frequency": "real-time", + "access_method": { + "web": "https://trends.google.com/", + "api": "unofficial (pytrends)", + "export": "CSV download" + }, + "license": "free", + "languages": ["en", "zh", "multilingual"], + "tags": ["search-trends", "keywords", "consumer-interest", "market-research", "real-time"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/sectors/marketing/hubspot-research.json b/firstdata/sources/sectors/marketing/hubspot-research.json new file mode 100644 index 0000000..e0cfc62 --- /dev/null +++ b/firstdata/sources/sectors/marketing/hubspot-research.json @@ -0,0 +1,33 @@ +{ + "id": "hubspot-research", + "name": { + "zh": "HubSpot Research", + "en": "HubSpot Research" + }, + "description": { + "zh": "HubSpot 发布的营销、销售、客户服务行业报告和基准数据,包含 State of Marketing 等年度报告", + "en": "Marketing, sales, customer service industry reports and benchmarks by HubSpot, including annual State of Marketing reports" + }, + "url": "https://www.hubspot.com/marketing-statistics", + "authority_level": "industry", + "authority_justification": { + "zh": "上市公司(NYSE: HUBS),全球领先的营销自动化平台,拥有 20万+ 客户的一手数据", + "en": "Public company (NYSE: HUBS), leading marketing automation platform with first-party data from 200K+ customers" + }, + "data_type": ["reports", "benchmarks", "statistics", "surveys"], + "coverage": { + "geographic": "global", + "temporal": "2006-present", + "indicators": "marketing, sales, CRM, content" + }, + "update_frequency": "annual", + "access_method": { + "web": "https://www.hubspot.com/marketing-statistics", + "reports": 
"free download with registration" + }, + "license": "free", + "languages": ["en"], + "tags": ["marketing-automation", "inbound-marketing", "sales", "crm", "benchmarks"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/sectors/marketing/nielsen.json b/firstdata/sources/sectors/marketing/nielsen.json new file mode 100644 index 0000000..b8593ca --- /dev/null +++ b/firstdata/sources/sectors/marketing/nielsen.json @@ -0,0 +1,33 @@ +{ + "id": "nielsen", + "name": { + "zh": "Nielsen", + "en": "Nielsen" + }, + "description": { + "zh": "全球领先的媒体和消费者洞察公司,提供电视收视率、广告效果、消费者行为数据", + "en": "Global leader in media and consumer insights, providing TV ratings, advertising effectiveness, consumer behavior data" + }, + "url": "https://www.nielsen.com/", + "authority_level": "market", + "authority_justification": { + "zh": "成立于 1923 年,电视收视率的行业标准制定者,被全球广告主和媒体公司采用", + "en": "Founded in 1923, industry standard setter for TV ratings, adopted by global advertisers and media companies" + }, + "data_type": ["tv_ratings", "advertising", "consumer_behavior", "retail"], + "coverage": { + "geographic": "global (100+ countries)", + "temporal": "1923-present", + "indicators": "TV, streaming, audio, retail panels" + }, + "update_frequency": "daily", + "access_method": { + "web": "https://www.nielsen.com/", + "reports": "Nielsen Insights" + }, + "license": "subscription (some reports free)", + "languages": ["en", "zh"], + "tags": ["tv-ratings", "media", "advertising", "consumer", "retail", "audience"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/sectors/marketing/semrush.json b/firstdata/sources/sectors/marketing/semrush.json new file mode 100644 index 0000000..6158f00 --- /dev/null +++ b/firstdata/sources/sectors/marketing/semrush.json @@ -0,0 +1,33 @@ +{ + "id": "semrush", + "name": { + "zh": "SEMrush", + "en": "SEMrush" + }, + "description": { + "zh": "全球领先的 SEO 和数字营销工具,提供关键词研究、竞品分析、广告情报、内容营销数据", + "en": "Leading global SEO and 
digital marketing tool providing keyword research, competitor analysis, advertising intelligence, content marketing data" + }, + "url": "https://www.semrush.com/", + "authority_level": "market", + "authority_justification": { + "zh": "上市公司(NYSE: SEMR),全球 1000 万+ 用户,被 Forbes、IBM 等使用", + "en": "Public company (NYSE: SEMR), 10M+ users globally, used by Forbes, IBM, etc." + }, + "data_type": ["seo", "keywords", "backlinks", "advertising", "content"], + "coverage": { + "geographic": "global", + "temporal": "2008-present", + "indicators": "25B+ keywords, 800M+ domains" + }, + "update_frequency": "daily", + "access_method": { + "web": "https://www.semrush.com/", + "api": "https://developer.semrush.com/" + }, + "license": "subscription", + "languages": ["en"], + "tags": ["seo", "keywords", "backlinks", "ppc", "content-marketing", "competitors"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/sectors/marketing/similarweb.json b/firstdata/sources/sectors/marketing/similarweb.json new file mode 100644 index 0000000..ae49ca9 --- /dev/null +++ b/firstdata/sources/sectors/marketing/similarweb.json @@ -0,0 +1,33 @@ +{ + "id": "similarweb", + "name": { + "zh": "SimilarWeb", + "en": "SimilarWeb" + }, + "description": { + "zh": "网站流量和数字营销情报平台,提供网站排名、流量来源、受众分析、竞品对比", + "en": "Website traffic and digital marketing intelligence platform providing rankings, traffic sources, audience analysis, competitor benchmarking" + }, + "url": "https://www.similarweb.com/", + "authority_level": "market", + "authority_justification": { + "zh": "上市公司(NYSE: SMWB),被 Google、Adobe、Walmart 等使用,全球数字营销情报标准", + "en": "Public company (NYSE: SMWB), used by Google, Adobe, Walmart, global standard for digital marketing intelligence" + }, + "data_type": ["web_traffic", "rankings", "audience", "competitors"], + "coverage": { + "geographic": "global", + "temporal": "2007-present", + "indicators": "100M+ websites tracked" + }, + "update_frequency": "daily", + "access_method": { 
+ "web": "https://www.similarweb.com/", + "api": "https://developers.similarweb.com/" + }, + "license": "freemium (部分免费)", + "languages": ["en"], + "tags": ["web-traffic", "seo", "digital-marketing", "competitors", "audience"], + "verified": true, + "last_verified": "2026-02-02" +} diff --git a/firstdata/sources/sectors/marketing/statista.json b/firstdata/sources/sectors/marketing/statista.json new file mode 100644 index 0000000..4f323c8 --- /dev/null +++ b/firstdata/sources/sectors/marketing/statista.json @@ -0,0 +1,33 @@ +{ + "id": "statista", + "name": { + "zh": "Statista", + "en": "Statista" + }, + "description": { + "zh": "全球领先的商业数据平台,提供市场规模、消费者行为、行业趋势等统计数据,覆盖 170+ 行业", + "en": "Leading global business data platform providing market size, consumer behavior, industry trends across 170+ industries" + }, + "url": "https://www.statista.com/", + "authority_level": "market", + "authority_justification": { + "zh": "被 Fortune 500 公司、顶级咨询公司和学术机构广泛引用,数据来源包括政府、行业协会和市场研究", + "en": "Widely cited by Fortune 500, top consulting firms and academia, data sourced from governments, industry associations and market research" + }, + "data_type": ["statistics", "market_research", "infographics", "reports"], + "coverage": { + "geographic": "global", + "temporal": "1999-present", + "indicators": "1,000,000+ statistics, 80,000+ topics" + }, + "update_frequency": "daily", + "access_method": { + "web": "https://www.statista.com/", + "api": "available for enterprise" + }, + "license": "subscription (部分免费)", + "languages": ["en", "de", "es", "fr", "zh"], + "tags": ["market-research", "statistics", "consumer", "industry", "trends"], + "verified": true, + "last_verified": "2026-02-02" +}