标签:
项目 一、项目需求:对搜索关键词进行类别的统计分析,为了后面的entity-rank做准备。
0,各种关键数据统计:
数据量:1个月的数据约为 1000T。
1,对IE的所有浏览搜索的提取代码:
Scope:
//Script GUID:ad2766d3-7aec-4ffa-9bbd-ec2740361999
//Used for tracking history

// Counts (query, normalized clicked URL) pairs from the competitive unified
// page view for 2016-07-26, restricted to the zh-cn market and the web
// vertical, and writes the pairs seen at least twice.

REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

UnifiedViewRaw =
    VIEW "/shares/searchDM/distrib/released/CompetitiveUnifiedView/CompetitiveUnifiedPageView.view"
    PARAMS
    (
        Start = @"2016-07-26",
        End = @"2016-07-26",
        Source = @"All"
        // Source = @"DesktopIE"
    );

// NOTE(review): there is no explicit GROUP BY; presumably SCOPE groups
// implicitly by the non-aggregated columns (Query, NormalizedUrl) - confirm.
ClickData =
    SELECT Page_FromPage.Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Request_Url)) AS NormalizedUrl,
           COUNT() AS Count
    FROM UnifiedViewRaw
    WHERE Page_FromPage.IsQuery == true
          AND Page_FromPage.Vertical.ToLower() == "web"
          AND NOT Request_IsQuery
          AND Page_FromPage.Market.ToLower() == "zh-cn"
    HAVING Count >= 2
           AND Query != ""
           AND NormalizedUrl != ""
           AND NormalizedUrl != null;

// Page_FromPage.IsQuery: True if the page is a query page
// Vertical: Search Vertical of this PageView
// Request_IsQuery bool: True if this page view is search engine result page

OUTPUT TO @"/my/entityranker/oneDay/IEqueryclickurlpairsAll.wsv";
C#:
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

/// <summary>
/// String helpers for reducing URLs to a canonical, scheme-less form.
/// </summary>
public class CMyUtils
{
    /// <summary>
    /// Lower-cases a URL and strips a leading "http://" or "https://", a
    /// leading "www." and a single trailing "/".
    /// </summary>
    /// <param name="url">The URL to normalize; may be null or empty.</param>
    /// <returns>The normalized URL, or an empty string for null or empty input.</returns>
    public static string NormalizeURL(string url)
    {
        // A null URL previously caused a NullReferenceException on ToLower().
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // URLs are machine identifiers: use invariant casing and ordinal
        // comparisons so the result does not depend on the machine culture.
        url = url.ToLowerInvariant();

        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }

        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }

        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            url = url.Substring(0, url.Length - 1);
        }

        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first "/".
    /// </summary>
    /// <param name="url">The URL to extract the host from; may be null or empty.</param>
    /// <returns>The normalized URL truncated at its first slash, if any.</returns>
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);

        // The original used typographic quotes around the slash, which is not
        // a valid char literal and did not compile.
        int slashPosition = url.IndexOf('/');
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }

        return url;
    }
}

/// <summary>
/// Reducer that passes through at most the first 100 rows of its input.
/// </summary>
public class TopReducer : Reducer
{
    /// <summary>Maximum number of rows emitted per Reduce call.</summary>
    private const int MaxRowsPerCall = 100;

    /// <summary>
    /// Declares the output schema, which is a clone of the input schema.
    /// </summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return input.Clone();
    }

    /// <summary>
    /// Copies the first 100 input rows to the output row and yields them.
    /// </summary>
    /// <param name="input">Rows handed to this call (presumably one reduce group - confirm).</param>
    /// <param name="output">Reusable output row that each emitted row is copied into.</param>
    /// <param name="args">Unused.</param>
    /// <returns>The emitted rows, in input order.</returns>
    public override IEnumerable<Row> Reduce(RowSet input, Row output, string[] args)
    {
        int count = 0;

        // Rows beyond the limit are still enumerated, as in the original,
        // but are not emitted.
        foreach (Row row in input.Rows)
        {
            if (++count <= MaxRowsPerCall)
            {
                row.Copy(output);
                yield return output;
            }
        }
    }
}
2,对bing的所有浏览搜索的提取代码:
Scope:
//Script GUID:8e9ba2e3-8288-49e1-a5ff-7776d653ae16
//Used for tracking history

// Counts (word-broken query, normalized URL) pairs from the search-log page
// view (Dataset "Mobile", 2016-07-26, zh-cn market, web vertical) for rows
// the FEXLogSimpleExtractor processor emits with Click > 0, and writes the
// pairs seen at least twice.

REFERENCE "/local/IndexQualityCJK/wb/WordBreaker.dll";
RESOURCE "/local/IndexQualityCJK/wb/unzip.exe";
RESOURCE "/local/IndexQualityCJK/wb/wordbreak.zip";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/SearchLogApi.dll";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Serializer.exe";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Microsoft.Live.Json.dll";
REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

USING MS.Internal.Bing.DataMining.SearchLogApi;
USING Microsoft.Live.Json;

SlapiPageView =
    VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogPageView.view"
    //VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogSessionView.view"
    //PARAMS (Start = @"2013-09-01", End = @"2013-09-01", UseSample=false, Dataset="Mobile");
    PARAMS (Start = "2016-07-26", End = "2016-07-26", UseSample=false, Dataset= "Mobile");

ZHCNTraffic =
    SELECT WordBreaker.BreakWords(Query_RawQuery, "zh-cn") AS Query,
           Request_RequestTime.ToString("yyyy-MM-dd") AS QDate,
           Page_Entities_WebResults
    FROM SlapiPageView
    WHERE Request_IsBotVNext == false
          AND Request_IsMarketingTraffic == false
          AND string.IsNullOrEmpty(Market) == false
          AND Market.ToLower() == "zh-cn"
          AND string.IsNullOrEmpty(Vertical) == false
          AND Vertical.ToLower() == "web";
          //AND string.IsNullOrEmpty(FormCode) == false AND (FormCode.ToUpper() == "QBLH" OR FormCode.ToUpper() == "QBRE" OR FormCode.ToUpper() == "MSNBHP" OR FormCode.ToUpper() == "MSNFLH" OR FormCode.ToUpper() == "BCNASI");

// FEXLogSimpleExtractor is defined in this script's C# code-behind.
ProcessWebEntity =
    PROCESS ZHCNTraffic
    USING FEXLogSimpleExtractor;

// NOTE(review): there is no explicit GROUP BY; presumably SCOPE groups
// implicitly by the non-aggregated columns (Query, NormalizedUrl) - confirm.
ClickQueryUrlPairs =
    SELECT Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Url)) AS NormalizedUrl,
           COUNT() AS PairCount
    FROM ProcessWebEntity
    WHERE Click > 0
    HAVING PairCount >= 2
           AND Query != ""
           AND NormalizedUrl != ""
           AND NormalizedUrl != null;

OUTPUT TO @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv";
C#:
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
using MS.Internal.Bing.DataMining.SearchLogApi;

/// <summary>
/// String helpers for reducing URLs to a canonical, scheme-less form.
/// </summary>
public class URLUtility
{
    /// <summary>
    /// Lower-cases a URL and strips a leading "http://" or "https://", a
    /// leading "www." and a single trailing "/".
    /// </summary>
    /// <param name="url">The URL to normalize; may be null or empty.</param>
    /// <returns>The normalized URL, or an empty string for null or empty input.</returns>
    public static string NormalizeURL(string url)
    {
        // A null URL previously caused a NullReferenceException on ToLower().
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // URLs are machine identifiers: use invariant casing and ordinal
        // comparisons so the result does not depend on the machine culture.
        url = url.ToLowerInvariant();

        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }

        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }

        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            url = url.Substring(0, url.Length - 1);
        }

        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first "/".
    /// </summary>
    /// <param name="url">The URL to extract the host from; may be null or empty.</param>
    /// <returns>The normalized URL truncated at its first slash, if any.</returns>
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);

        // The original used typographic quotes around the slash, which is not
        // a valid char literal and did not compile.
        int slashPosition = url.IndexOf('/');
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }

        return url;
    }
}

/// <summary>
/// Processor that emits one row for every web result whose
/// PositionOfEntityInTopLevelRegion is 1, carrying the query, query date,
/// result URL, host, position and click count.
/// </summary>
public class FEXLogSimpleExtractor : Processor
{
    /// <summary>
    /// Declares the fixed output schema of this processor.
    /// </summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return new Schema("Query:string,QueryDate:string,Url:string,Host:string,POS:int,Click:int");
    }

    /// <summary>
    /// Reads "Query", "QDate" and "Page_Entities_WebResults" from each input
    /// row and yields an output row for each position-1 web result.
    /// </summary>
    /// <param name="input">Rows providing the three columns named above.</param>
    /// <param name="output">Reusable output row matching the schema from Produces.</param>
    /// <param name="args">Unused.</param>
    /// <returns>One row per web result at position 1.</returns>
    public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
    {
        foreach (Row row in input.Rows)
        {
            string query = row["Query"].String;
            string queryDate = row["QDate"].String;
            var webEntities = row["Page_Entities_WebResults"].Value as MS.Internal.Bing.DataMining.SearchLogApi.WebResultList;

            // The "as" cast yields null when the column is empty or holds a
            // different type; skip such rows instead of dereferencing null.
            if (webEntities == null)
            {
                continue;
            }

            for (int i = 0; i < webEntities.Count; i++)
            {
                var entity = webEntities[i];
                int pos = entity.PositionOfEntityInTopLevelRegion;

                // Only position-1 results are emitted, so the URL, host and
                // click count are not computed for the others.
                if (pos != 1)
                {
                    continue;
                }

                // A null TitleUrl is emitted as an empty string rather than
                // aborting the job.
                string url = entity.TitleUrl ?? string.Empty;
                string host = URLUtility.GetHost(url);
                int click = entity.Clicks.Count;

                output["Query"].Set(query);
                output["QueryDate"].Set(queryDate);
                output["Url"].Set(url);
                output["Host"].Set(host);
                output["POS"].Set(pos);
                output["Click"].Set(click);
                yield return output;
            }
        }
    }
}
3,搜索查询和分类的提取代码:
//Script GUID:5b7abb8b-defd-4f2b-b703-9882ee6e960b
//Used for tracking history

// Runs RetroIndexProcessor over the latest index snapshot and writes the
// Url, Country, Language and Category columns for rows whose Language is
// "zh_chs" (compared case-insensitively via ToLower).

REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/RetroIndexProcessor.dll";
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE @"/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

USING RetroIndex;

Snapshot =
    VIEW "/shares/searchWebLoad/RetroIndex/Views/LatestSnapshot.view"
    PARAMS
    (
        Sample = false,
        TierFlag = 3
    );

// NOTE(review): this SELECT is not assigned to a name and the PROCESS below
// names no input; presumably the PROCESS consumes this rowset - confirm.
SELECT Url,
       Header,
       Body,
       HttpHeader,
       CodePage
FROM Snapshot;

Uberchunk =
    PROCESS
    PRODUCE Url,
            Country,
            Language,
            Category
    USING RetroIndexProcessor
    HAVING string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs";

OUTPUT TO @"/local/IndexCJK/MobileTopSites/UrlCategory.wsv";
C#:
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

/// <summary>
/// Helpers that inspect a "#TAB#"-separated category string for entries that
/// start with "aa00" and end with one of a set of mobile-related suffixes.
/// </summary>
public class Utility
{
    /// <summary>Separator between entries of a category string.</summary>
    private static readonly string[] CategorySeparator = new string[] { "#TAB#" };

    /// <summary>Prefix an entry must carry to be considered.</summary>
    private const string CategoryPrefix = "aa00";

    /// <summary>
    /// Returns true when any entry starts with "aa00" and ends with "Mobi",
    /// "CrossDevice", "MobileFriendly" or "MobileUnFriendly".
    /// </summary>
    /// <param name="category">The "#TAB#"-separated category string; may be null or empty.</param>
    /// <returns>True if a matching entry exists; false otherwise, including for null or empty input.</returns>
    public static bool CJKVersionMobileFriendly(string category)
    {
        if (string.IsNullOrEmpty(category))
        {
            return false;
        }

        string[] cates = category.Split(CategorySeparator, StringSplitOptions.RemoveEmptyEntries);
        foreach (string cate in cates)
        {
            if (!cate.StartsWith(CategoryPrefix, StringComparison.Ordinal))
            {
                continue;
            }

            // NOTE(review): "MobileUnFriendly" is accepted here as well, so
            // every category matched by CJKVersionMobileUnFriendly is also
            // matched by this method - confirm that this is intended.
            if (cate.EndsWith("Mobi", StringComparison.Ordinal)
                || cate.EndsWith("CrossDevice", StringComparison.Ordinal)
                || cate.EndsWith("MobileFriendly", StringComparison.Ordinal)
                || cate.EndsWith("MobileUnFriendly", StringComparison.Ordinal))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Returns true when any entry starts with "aa00" and ends with
    /// "MobileUnFriendly".
    /// </summary>
    /// <param name="category">The "#TAB#"-separated category string; may be null or empty.</param>
    /// <returns>True if a matching entry exists; false otherwise, including for null or empty input.</returns>
    public static bool CJKVersionMobileUnFriendly(string category)
    {
        if (string.IsNullOrEmpty(category))
        {
            return false;
        }

        string[] cates = category.Split(CategorySeparator, StringSplitOptions.RemoveEmptyEntries);
        foreach (string cate in cates)
        {
            if (cate.StartsWith(CategoryPrefix, StringComparison.Ordinal)
                && cate.EndsWith("MobileUnFriendly", StringComparison.Ordinal))
            {
                return true;
            }
        }

        return false;
    }
}

/// <summary>
/// Processor that assigns each zh_chs URL an integer MobileClassifier value
/// (0, 1, 2 or 3) derived from several mobile-related columns.
/// </summary>
public class CJKVersionMobileOkClassifierProcessor : Processor
{
    /// <summary>
    /// Declares the fixed output schema of this processor.
    /// </summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return new Schema("Url:string, MobileClassifier:int");
    }

    /// <summary>
    /// Yields one (Url, MobileClassifier) row for every input row that has a
    /// non-empty Url and a Language equal to "zh_chs" ignoring case.
    /// </summary>
    /// <param name="input">Rows providing Url, Language, Category and the mobile feature columns read below.</param>
    /// <param name="output">Reusable output row matching the schema from Produces.</param>
    /// <param name="args">Unused.</param>
    /// <returns>The classified rows; other rows are skipped.</returns>
    public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
    {
        foreach (Row row in input.Rows)
        {
            string url = row["Url"].String;
            string language = row["Language"].String;

            if (string.IsNullOrEmpty(url)
                || !string.Equals(language, "zh_chs", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // Classifier features.
            string category = row["Category"].String;
            string mobileUrls = row["DUPipeline_MobileUrls"].String;
            string responsiveDesignSpans = row["DUPipeline_ResponsiveDesignSpans"].String;
            string mobileUrlV2 = row["DUV2_MobileUrl"].String;
            string mobileOkClassifierV1 = row["InjHdr_MobileOkClassifier_v1"].String;
            string mobileOkXV1 = row["InjHdr_MobileOkX_v1"].String;
            string mobileRedirectV1 = row["InjHdr_MobileRedirect_V1"].String;

            // Prefer the MobileOkClassifier header; fall back to MobileOkX.
            string mobileOk = string.IsNullOrEmpty(mobileOkClassifierV1)
                ? mobileOkXV1
                : mobileOkClassifierV1;

            int mobileClassifier = 0;

            // Branch order matters: the first matching branch wins.
            if (!string.IsNullOrEmpty(mobileUrls)
                || !string.IsNullOrEmpty(responsiveDesignSpans)
                || !string.IsNullOrEmpty(mobileUrlV2)
                || !string.IsNullOrEmpty(mobileRedirectV1)
                || Utility.CJKVersionMobileFriendly(category)
                || mobileOk == "1")
            {
                mobileClassifier = 1;
            }
            else if (Utility.CJKVersionMobileUnFriendly(category) || mobileOk == "3")
            {
                mobileClassifier = 3;
            }
            else if (mobileOk == "2")
            {
                mobileClassifier = 2;
            }

            output["Url"].Set(url);
            output["MobileClassifier"].Set(mobileClassifier);
            yield return output;
        }
    }
}
4,对IE和bing进行union,然后对相同的query进行合并。
Scope
//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history

// Concatenates the IE and Bing (Query, Url, Count) files, sums Count per
// pair, and writes the result ordered by Query.

// NOTE(review): the IE extraction script shown earlier in this document
// writes "IEqueryclickurlpairsAll.wsv", but this script reads
// "IEqueryclickurlpairs.wsv" - confirm which file is intended.
ie =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
    USING DefaultTextExtractor();

bing =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
    USING DefaultTextExtractor();

union_all =
    SELECT *
    FROM ie
    UNION ALL
    SELECT *
    FROM bing;

// NOTE(review): there is no explicit GROUP BY; presumably SCOPE groups
// implicitly by the non-aggregated columns (Query, Url) - confirm.
result =
    SELECT Query,
           Url,
           SUM(Count) AS NewCount
    FROM union_all
    ORDER BY Query;

OUTPUT result TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";
5,
6,得到了query->Category之后,要算哪个category出现得最多。
把category每一次出现都乘以对应的clickCount,然后累加起来。
这里用到了reduce来做。
Intern---Microsoft Academic China Team
标签:
原文地址:http://www.cnblogs.com/yueyebigdata/p/5729713.html