一、基本流程
1、采集已有的公众号文章:
首先用任意微信账号登录微信 PC 端(需先下载并安装好微信 PC 端)
把你做的采集入口(如:http://xxx.com/api/proxy/begincollect)发送到微信上
在微信 PC 端中直接点击该链接打开即可
2、如果公众号不存在:
登记一篇该公众号的文章,一篇即可
做一个自动客户端,当有新文章时自动导航访问;Fiddler 监测到后,会自动推送到后台生成“公众号记录”
二、准备工作
1、Fiddler 开启抓取 HTTPS 支持:Tools → Options
2、自动解码
3、配置过滤
三、编写抓取脚本
1、在 Fiddler 中编写抓取脚本:Rules → Customize Rules
// Marker string: getFullUrl appends it to generated URLs in debug mode, and
// OnBeforeResponse then ignores every session whose URI does not contain it.
// It is also sent as the "key" query value of the polling page URL.
static var tagUrl = "&abc=";
// Host name of the server that serves the collection entry (polling) page.
static var begincollectHost = "web.test.com";//replace with your own server
static var begincollectUrl = "/api/proxy/begincollect"; //replace with the path of your waiting (polling) entry page
static var host = "localhost:33386";//host (and port) of your API service
static var apiUrl = "/api/proxy/weixin";//path of your API service endpoint
// Debug switch: when true, only URLs tagged with tagUrl are processed.
static var debug = false;
// Sends contentStr as the UTF-8 encoded body of a POST request to `url` on
// `host` through Fiddler's proxy, waits for the response, and returns the
// response body as a string.
// NOTE(review): the Content-Type value below is not a registered media type;
// it is kept unchanged here because the request is sent with exactly this header.
static function httpPost(url: String,host: String,contentStr: String): String{
    var payload: byte[] = System.Text.Encoding.UTF8.GetBytes(contentStr);
    var requestHeaders: HTTPRequestHeaders = new HTTPRequestHeaders(
        url,
        [
            'Host: ' + host,
            'Content-Length: ' + payload.length.ToString(),
            'Content-Type: application/x-www-url-encoded'
        ]);
    requestHeaders.HTTPMethod = "POST";

    var sessionFlags = new System.Collections.Specialized.StringDictionary();
    var reply = FiddlerApplication.oProxy.SendRequestAndWait(requestHeaders, payload, sessionFlags, null);
    return reply.GetResponseBodyAsString();
}
// Posts contentStr to the API service (apiUrl on host) with the given message
// type in the query string, logs the raw reply to the Fiddler log, and returns
// the reply decoded by Fiddler's JSON decoder.
static function sendMsg(contentStr: String,type: String) : Object {
    var endpoint = apiUrl + "?type=" + type;
    var reply = httpPost(endpoint, host, contentStr);
    FiddlerApplication.Log.LogString("result:" + reply);
    return Fiddler.WebFormats.JSON.JsonDecode(reply);
}
// Builds an absolute https://mp.weixin.qq.com/ URL from a relative path.
//   url: path plus query (and optional #fragment); a leading "/" is accepted
//        and ignored so the result never contains "//" after the host.
// In debug mode the tagUrl marker is added to the query string (before any
// #fragment) so that OnBeforeResponse can recognise the request.
static function getFullUrl(url:String){
    // Callers pass paths both with and without a leading slash; normalise so
    // the concatenation below always yields exactly one "/" after the host.
    while (url.StartsWith("/")) {
        url = url.Substring(1);
    }
    if (debug) {
        var fragment = "";
        var hashPos = url.IndexOf('#');
        if (hashPos > 0) {
            fragment = url.Substring(hashPos);
            url = url.Substring(0, hashPos);
        }
        // tagUrl starts with "&", so a URL without a query string first gets
        // a dummy "?a=" parameter to keep the result well-formed.
        url = url + (url.IndexOf('?') > 0 ? "" : "?a=") + tagUrl + fragment;
    }
    return "https://mp.weixin.qq.com/" + url;
}
// Returns a random delay in milliseconds: a whole number of seconds,
// from 3 to 10 inclusive, multiplied by 1000.
static function getRndInternal(){
    var seconds = new System.Random().Next(3, 11);
    return seconds * 1000;
}
// Convenience overload: builds the redirect script for `url` and passes a
// delay of 0, which the two-argument overload replaces with a random delay.
static function getReloadScript(url:String){
    var useRandomDelay = 0;
    return getReloadScript(url, useRandomDelay);
}
// Builds an inline <script> element that navigates the page to `url` after
// `time` milliseconds, logs it to the Fiddler log, and returns it.
//   url:  target address; it is escaped before being placed inside the
//         single-quoted JavaScript string literal.
//   time: delay in milliseconds; 0 means "use a random delay from getRndInternal()".
static function getReloadScript(url:String,time:int){
    if (time == 0) {
        time = getRndInternal();
    }
    // The URL ends up inside '...' within a <script> element, so a backslash,
    // a single quote or a "<" (e.g. "</script>") in it would break the
    // injected script. URLs without these characters are unchanged.
    var safeUrl = url.Replace("\\", "\\\\").Replace("'", "\\'").Replace("<", "\\x3C");
    var script = " <script>setTimeout(function(){window.location.href='"+safeUrl+"'},"+time+");</script>";
    FiddlerApplication.Log.LogString("reloadscript:"+script);
    return script;
}
// Returns the URL of the history ("profile_ext?action=home") page of the
// official account identified by `biz`.
static function getMPHisUrl(biz:String){
    var historyPath = "mp/profile_ext?action=home&__biz=" + biz + "&scene=124#wechat_redirect";
    return getFullUrl(historyPath);
}
// Returns a redirect script (with a random delay) that navigates to the
// history page of the official account identified by `biz`.
static function getMPhisReloadScript(biz:String){
    return getReloadScript(getMPHisUrl(biz));
}
// Returns the URL of the history-list API ("profile_ext?action=getmsg") for
// an official account.
//   biz:         account identifier (__biz query value)
//   pass_ticket: pass_ticket query value taken from the WeChat session
//   offset:      index of the first record to return; 10 records are requested
// The path is passed without a leading "/", like getMPHisUrl does, because
// getFullUrl already supplies the "/" after the host.
static function getMsgHisUrl(biz:String,pass_ticket:String,offset:String){
    var query = "action=getmsg&__biz=" + biz
        + "&f=json&offset=" + offset
        + "&count=10&is_ok=1&scene=124&pass_ticket=" + pass_ticket
        + "&x5=0&f=json";
    return getFullUrl("mp/profile_ext?" + query);
}
// Fiddler response hook that drives the collection loop by appending a
// delayed-redirect <script> to selected responses.
//
//  * Entry (polling) page on begincollectHost: between 21:00 and 09:00 the
//    page is told to reload itself at the next 09:00; otherwise the API
//    service is asked (type 4) for an account to collect, and the page is
//    redirected to that account's history page, or back to itself in about
//    an hour when nothing is pending. The entry page must be opened in the
//    WeChat PC client's built-in browser.
//  * mp.weixin.qq.com history page (action=home): the page is sent to the
//    API service (type 1) and the browser is redirected to the first page of
//    the history-list API.
//  * mp.weixin.qq.com history list (action=getmsg): the list is sent to the
//    API service (type 2); after the first page (offset=0) the second page is
//    requested, after that the browser returns to the polling page.
//  * mp.weixin.qq.com article pages: the page is sent to the API service
//    (type 3) and the response is left unchanged.
//
// Any other response is left untouched.
static function OnBeforeResponse(oSession: Session) {
    if (m_Hide304s && oSession.responseCode == 304) {
        oSession["ui-hide"] = "true";
    }
    // In debug mode only requests carrying the marker are processed.
    if (debug && !oSession.uriContains(tagUrl)) {
        return;
    }

    var reloadScript = "";
    var responses = "";
    var url = "";
    // Address of the polling (waiting) page.
    var collectUrl = "http://" + begincollectHost + begincollectUrl + "?key=" + tagUrl;

    if (oSession.HostnameIs(begincollectHost) && oSession.uriContains(begincollectUrl)) {
        // Remove compression/chunking before reading the body as text.
        oSession.utilDecodeResponse();
        responses = oSession.GetResponseBodyAsString();

        var time = 0;
        var now: System.DateTime = System.DateTime.Now;
        if (now.Hour < 9 || now.Hour >= 21) {
            // No collection between 21:00 and 09:00: sleep until the next
            // 09:00 (today when it is still before 9, otherwise tomorrow).
            var resumeAt: System.DateTime = now.Date.AddHours(9);
            if (now.Hour >= 21) {
                resumeAt = resumeAt.AddDays(1);
            }
            url = collectUrl;
            time = System.Convert.ToInt32(resumeAt.Subtract(now).TotalMilliseconds);
        }
        else {
            // Ask the API service which account (biz) should be collected next.
            var pending = sendMsg("", "4");
            var biz = pending.JSONObject["biz"];
            if (biz != undefined) {
                // Go to that account's history page.
                url = getMPHisUrl(biz);
            }
            else {
                // Nothing to collect: keep polling the empty page, roughly hourly.
                time = 3600 * 1000 + getRndInternal();
                url = collectUrl;
            }
        }
        reloadScript = getReloadScript(url, time);
        // The entry page here returns JSON; force HTML so the injected script
        // runs. Not needed when the entry page already returns HTML.
        oSession.oResponse["Content-Type"]="text/html; charset=UTF-8";
        oSession.utilSetResponseBody(responses + reloadScript);
        return;
    }

    if (oSession.HostnameIs("mp.weixin.qq.com")) {
        var isHome = oSession.uriContains("profile_ext?action=home");
        var isGetMsg = oSession.uriContains("profile_ext?action=getmsg");
        var isArticle = oSession.uriContains("/s/") || oSession.uriContains("s?__biz=");
        if (!isHome && !isGetMsg && !isArticle) {
            // Images, scripts and other resources: rewriting them through a
            // string would only risk corrupting them, so leave them alone.
            return;
        }

        // Decode first, then read: the body may be compressed or chunked.
        oSession.utilDecodeResponse();
        responses = oSession.GetResponseBodyAsString();

        if (isHome) { // account history page
            sendMsg(responses, "1"); // record the account information
            // Request the first page of the history list.
            url = oSession.fullUrl.Replace("action=home","action=getmsg")+"&x5=0&f=json&f=json&offset=0&count=10&is_ok=1";
            reloadScript = getReloadScript(url);
        }
        else if (isGetMsg) { // history list
            var firstPage = oSession.uriContains("&offset=0");
            // For the second page the request URL is prepended so the API
            // service can read the account's __biz from it.
            var content = (firstPage ? "" : oSession.url) + responses;
            sendMsg(content, "2"); // save the article records
            if (firstPage) {
                // Fetch the second page of 10 records (20 in total).
                url = oSession.fullUrl.Replace("&offset=0&","&offset=10&");
            }
            else {
                // Done with this account: return to the polling page.
                url = collectUrl;
            }
            reloadScript = getReloadScript(url);
            // The list is JSON; force HTML so the injected script runs.
            oSession.oResponse["Content-Type"]="text/html; charset=UTF-8";
        }
        else { // article page
            sendMsg(responses, "3"); // save the article content
            return;
        }
        oSession.utilSetResponseBody(responses + reloadScript);
    }
}
2、服务端脚本(这里以 C#/.NET 为例)
public class ProxyController : ApiController
{
/// <summary>
/// Polling (waiting) page: returns a short text containing the current
/// server time, serialized as JSON. Accepts both GET and POST.
/// </summary>
[System.Web.Http.HttpGet]
[System.Web.Http.HttpPost]
public JsonResult<object> BeginCollect()
{
    string message = "Collect,现在时间:" + DateTime.Now.ToString();
    return Json<object>(message);
}
/// <summary>
/// Receives content pushed by the Fiddler script. The request body is the
/// raw content; <paramref name="type"/> selects how it is handled:
/// 1 = account history page (extract account info),
/// 2 = history list JSON, optionally prefixed by the request URL (save article records),
/// 3 = article page (derive account info from the article),
/// 4 = query the next account (biz) to collect.
/// </summary>
/// <param name="type">Message type, see summary.</param>
/// <returns>
/// For type 4 an object with a <c>biz</c> property when an account is
/// pending; an empty string in every other case.
/// </returns>
public async Task<JsonResult<object>> weixin(int type)
{
    // Content pushed from Fiddler.
    string content = await Request.Content.ReadAsStringAsync();
    object obj = string.Empty;

    switch (type)
    {
        case 1:
        {
            // Extract and save the account information from the history page.
            AddMsg(() =>
            {
                string accountBiz = GetValue(content, "var\\s*__biz\\s*=\\s*\".+\"");
                string nickName = GetValue(content, "var\\s*nickname\\s*=\\s*\".+\"");
                string headImg = GetValue(content, "var\\s*headimg\\s*=\\s*\".+\"");
                string appid = GetValue(content, "appid\\s*:\\s*\".+\"");
                if (nickName.Length > 0 && accountBiz.Length > 0)
                { // TODO: save accountBiz, nickName, headImg and appid to the database
                }
            });
            break;
        }
        case 2:
        {
            string biz = "";
            string json = content;
            int jsonStart = content.IndexOf('{');
            if (jsonStart < 0)
            {
                // No JSON payload at all: nothing to record.
                break;
            }
            if (jsonStart > 0)
            {
                // The body is "<request url><json>": read __biz from the URL.
                // Splitting on both '?' and '&' also finds __biz when it is
                // the first query parameter.
                const string bizPrefix = "__biz=";
                string url = content.Substring(0, jsonStart);
                biz = url.Split('?', '&')
                         .Where(part => part.StartsWith(bizPrefix, StringComparison.Ordinal))
                         .Select(part => part.Substring(bizPrefix.Length))
                         .FirstOrDefault() ?? "";
                json = content.Substring(jsonStart);
                if (biz.Length > 0)
                {
                    DataService.SetData("princess_updateflag", new { biz }, out string errMsg);
                }
            }
            AddMsg(() => RecorData(json, biz));
            break;
        }
        case 3:
        {
            AddMsg(() => BuildPrincess(content));
            break;
        }
        case 4:
        {
            dynamic data = DataService.GetData<ExpandoObject>("princess_getbiz", out string errMsg);
            if (data != null)
            {
                string biz = data.biz;
                if (biz?.Length > 0)
                    obj = new { biz };
            }
            break;
        }
    }
    return Json<object>(obj);
}
/// <summary>
/// Queues <paramref name="action"/> on the message queue. An exception thrown
/// by the action is written to the trace output and not rethrown, so one
/// failing item does not propagate out of the queued delegate.
/// </summary>
/// <param name="action">Work to run from the queue.</param>
private void AddMsg(Action action)
{
    MessageQueue.Add(new MessageQueueItem(() =>
    {
        try
        {
            action();
        }
        catch (Exception ex)
        {
            // Best effort: record the failure instead of dropping it silently.
            System.Diagnostics.Trace.TraceError("ProxyController queued action failed: {0}", ex);
        }
    }));
}
/// <summary>
/// Extracts the official-account information (biz, nickname, head image)
/// from an article page and stores it via the "Princess_insert" command.
/// Does nothing when no <c>__biz</c> value can be found in the page's
/// <c>msg_link</c> variable.
/// </summary>
/// <param name="content">HTML of the article page.</param>
private void BuildPrincess(string content)
{
    string msgLink = GetValue(content, "var\\s*msg_link\\s*=\\s*\".+\"");

    // The account id is the value of the __biz query parameter: everything
    // after "__biz=" up to the next '&' (or the end of the link).
    const string bizKey = "__biz=";
    int bizStart = msgLink.IndexOf(bizKey, StringComparison.Ordinal);
    if (bizStart < 0) return;
    string biz = msgLink.Substring(bizStart + bizKey.Length);
    int bizEnd = biz.IndexOf('&');
    if (bizEnd >= 0)
        biz = biz.Substring(0, bizEnd);
    if (biz.Length == 0) return;

    // Account display name and head image.
    string source_name = GetValue(content, "var\\s*nickname\\s*=\\s*\".+\"");
    string source_img_url = GetValue(content, "var\\s*ori_head_img_url\\s*=\\s*\".+\"");

    bool saved = DataService.SetData("Princess_insert", new
    {
        org_id = biz,
        source_name = source_name,
        source_url = "",
        source_img_url = source_img_url,
        img_url = source_img_url,
        biz = biz
    }, out string errMsg);
    if (!saved)
    {
        System.Diagnostics.Trace.TraceError("Princess_insert failed for biz {0}: {1}", biz, errMsg);
    }
}
/// <summary>
/// Returns the first match of <paramref name="pattern"/> in
/// <paramref name="value"/>, reduced to its interesting part:
/// when the pattern has a capture group, the first group's text; otherwise,
/// when the match contains a double quote after its first character, the
/// text between that quote and the next one (or the end of the match when
/// there is no closing quote); otherwise the whole match.
/// </summary>
/// <param name="value">Text to search; null or empty yields "".</param>
/// <param name="pattern">Regular expression to apply.</param>
/// <returns>The extracted text, or "" when nothing matches.</returns>
private string GetValue(string value, string pattern)
{
    if (string.IsNullOrEmpty(value))
        return "";

    // Run the regular expression once instead of IsMatch followed by Match.
    Match match = Regex.Match(value, pattern);
    if (!match.Success)
        return "";
    if (match.Groups.Count > 1)
        return match.Groups[1].Value;

    string result = match.Value;
    int open = result.IndexOf('"');
    if (open > 0)
    {
        result = result.Substring(open + 1);
        int close = result.IndexOf('"');
        // Tolerate a missing closing quote instead of throwing.
        if (close >= 0)
            result = result.Substring(0, close);
    }
    return result;
}
/// <summary>
/// Parses a history-list response and stores the articles that are not in
/// the database yet via the "doc_insert" command. For multi-article
/// messages the sub-articles are stored as well, sharing the parent
/// message's id and creation date.
/// </summary>
/// <param name="jsonData">JSON returned by the history-list API.</param>
/// <param name="biz">
/// Account id supplied by the caller; not used here, the account id is taken
/// from the first article's content_url instead.
/// </param>
private void RecorData(string jsonData,string biz)
{
    dynamic result = jsonData.ToObjectFromJson<ExpandoObject>();
    if (result.ret != 0) return;

    // general_msg_list is itself a JSON document embedded as a string.
    string general_msg_list = result.general_msg_list;
    string errMsg;
    dynamic data = general_msg_list.ToObjectFromJson<ExpandoObject>();
    var messages = data.list as List<dynamic>;
    if (messages == null) return;

    // Materialized once: the filter queries the database and the projection
    // mutates the items, so neither should run again on re-enumeration.
    List<dynamic> docs = messages.Where(item =>
    {
        if (!(item as IDictionary<string, object>).ContainsKey("app_msg_ext_info"))
            return false;
        // Keep only articles that are not stored yet.
        return DataService.GetDataValue<int>("doc_exists",
            out errMsg, new { articleid =
                $"{item.comm_msg_info.id}-{item.app_msg_ext_info.fileid}" }) == 0;
    }).Select(item =>
    {
        // Copy the message-level date and id onto the article entry.
        item.app_msg_ext_info.create_date =
            DateTimeHelper.GetDateTimeFromXml(item.comm_msg_info.datetime);
        item.app_msg_ext_info.pid = item.comm_msg_info.id.ToString();
        return item.app_msg_ext_info;
    }).ToList();
    if (docs.Count == 0) return;

    // The account id is the __biz query value of the first article's URL.
    string org_id = docs[0].content_url;
    org_id = org_id.Substring(org_id.IndexOf("__biz=") + 6).Split('&')[0];

    List<object> paras = GetDatas(org_id, docs).ToList();
    foreach (dynamic doc in docs)
    {
        if (doc.is_multi != 1) continue;
        var multiDocs = doc.multi_app_msg_item_list as IEnumerable<dynamic>;
        if (multiDocs == null) continue;
        IEnumerable<object> subParas = GetDatas(org_id, multiDocs, doc.create_date, $"{doc.pid}");
        paras.AddRange(subParas);
    }

    if (!DataService.SetData("doc_insert", paras, out errMsg))
    {
        System.Diagnostics.Trace.TraceError("doc_insert failed for {0}: {1}", org_id, errMsg);
    }
}
/// <summary>
/// Sends a picture URL to the "fileservice" command and returns the
/// <c>picurl</c> field of its result.
/// </summary>
/// <param name="picUrl">Address of the picture to hand to the file service.</param>
/// <returns>The <c>picurl</c> value reported by the file service.</returns>
private string UploadFile(string picUrl)
{
    var request = new
    {
        keyword = "file",
        content = new
        {
            ext = "jpg",
            data = picUrl
        }
    };
    dynamic uploaded = DataService.Execute<ExpandoObject>("fileservice", request);
    return uploaded.picurl;
}
/// <summary>
/// Builds the objects to store for a sequence of articles. The cover image
/// of each article is passed through <see cref="UploadFile"/> while the
/// sequence is enumerated (the result is lazy).
/// </summary>
/// <param name="org_id">Account id stored with every article.</param>
/// <param name="docs">Article entries to convert.</param>
/// <param name="create_date">
/// Creation date to use for every article; when null each article's own
/// <c>create_date</c> is used.
/// </param>
/// <param name="pid">
/// Parent message id to use in the article id; when null each article's own
/// <c>pid</c> is used.
/// </param>
/// <returns>One object per article, in input order.</returns>
private IEnumerable<object> GetDatas(string org_id, IEnumerable<dynamic> docs
    , DateTime? create_date = null, string pid = null)
{
    var paras = docs.Select(item =>
    {
        string imageUrl = item.cover;
        imageUrl = UploadFile(imageUrl);
        return new
        {
            // "<message id>-<file id>" identifies the article.
            articleid = $"{pid ?? item.pid}-{item.fileid}",
            title = item.title,
            digest = item.digest,
            ori_url = item.content_url,
            url = item.content_url,
            image_url = imageUrl,
            // NOTE(review): this stores the uploaded URL, not the original
            // cover URL - confirm that is intended.
            ori_image_url = imageUrl,
            doc_type = "图文",
            create_date = create_date ?? item.create_date,
            org_id = org_id
        };
    });
    return paras;
}
}
免责声明及注意事项:
目前理论上可以无限循环自动抓取所有公众号文章,但由于微信限制,当同一天采集达到100个以上公众号时,该微信会被暂时屏蔽。如果需要采集大量公众号的文章,可以借助第三方代理,让对外“IP”按规则变化。
【此文章仅供学习使用,不能用于任何商业目的。任何个人或组织用于非法目的使用,赚取利益的,与作者无关。造成的一切后果,由使用者自己承担】