码迷,mamicode.com
首页 > 其他好文 > 详细

Intern---Microsoft Academic China Team

时间:2016-08-02 17:04:38      阅读:143      评论:0      收藏:0      [点我收藏+]

标签:

项目 一、项目需求:对搜索关键词进行类别的统计分析,为了后面的entity-rank做准备。

0,各种关键数据统计:

数据量:1个月的数据约为1000T。

 

1,对IE的所有浏览搜索的提取代码:

Scope:

技术分享
//Script GUID:ad2766d3-7aec-4ffa-9bbd-ec2740361999
//Used for tracking history
//
// Purpose: from the CompetitiveUnifiedPageView view, collect
// (Query, NormalizedUrl, Count) rows for zh-cn web queries and write them to
// a .wsv file.
REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";
// Source view, restricted to the single day 2016-07-26 with Source = "All"
// (a "DesktopIE" alternative is left commented out below).
UnifiedViewRaw =
    VIEW "/shares/searchDM/distrib/released/CompetitiveUnifiedView/CompetitiveUnifiedPageView.view"
    PARAMS
    (
        Start = @"2016-07-26",
        End = @"2016-07-26",
        Source = @"All"
//      Source = @"DesktopIE"
    );
// Keep page views that are not themselves query pages but whose originating
// page (Page_FromPage) is a "web" vertical query in the zh-cn market; the
// request URL is passed through the RetroIndex URL normalizer.
// NOTE(review): COUNT() is used without an explicit GROUP BY; presumably the
// engine groups by the non-aggregated columns (Query, NormalizedUrl) - confirm.
// HAVING then drops pairs counted fewer than 2 times and rows with an empty
// Query or an empty/null NormalizedUrl.
ClickData =
    SELECT Page_FromPage.Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Request_Url)) AS NormalizedUrl,
           COUNT() AS Count
    FROM UnifiedViewRaw
    WHERE Page_FromPage.IsQuery == true AND Page_FromPage.Vertical.ToLower() == "web" AND NOT Request_IsQuery AND Page_FromPage.Market.ToLower() == "zh-cn"
    HAVING Count >= 2 AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null;

    // Field notes (from the original author):
    // Page_FromPage.IsQuery: True if the page is a query page
    // Vertical: Search Vertical of this PageView
    // Request_IsQuery bool: True if this page view is search engine result page

// No rowset is named after OUTPUT; presumably the most recent rowset
// (ClickData) is written - confirm against SCOPE semantics.
OUTPUT
TO @"/my/entityranker/oneDay/IEqueryclickurlpairsAll.wsv";
View Code

C#:

技术分享
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

/// <summary>
/// String helpers for reducing a URL to a canonical lower-case form and for
/// extracting its host part.
/// </summary>
public class CMyUtils
{
    /// <summary>
    /// Lower-cases <paramref name="url"/> and strips a leading "http://" or
    /// "https://", then a leading "www.", then a single trailing "/".
    /// </summary>
    /// <param name="url">URL text; may be null or empty.</param>
    /// <returns>
    /// The normalized URL, or an empty string when <paramref name="url"/> is
    /// null or empty.
    /// </returns>
    public static string NormalizeURL(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // URLs are machine identifiers: use culture-invariant casing and
        // ordinal comparisons so results do not depend on the current culture.
        url = url.ToLowerInvariant();
        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }

        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }

        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            url = url.Substring(0, url.Length - 1);
        }

        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first '/'.
    /// </summary>
    /// <param name="url">URL text; may be null or empty.</param>
    /// <returns>
    /// The normalized URL up to (not including) its first '/', or the whole
    /// normalized URL when it contains no '/'.
    /// </returns>
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);

        // The scheme has already been removed, so the first '/' (if any)
        // marks the end of the host portion.
        int slashPosition = url.IndexOf('/');
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }

        return url;
    }
}

/// <summary>
/// Reducer that passes input rows through unchanged but emits at most
/// <see cref="MaxRowsPerCall"/> rows per <see cref="Reduce"/> invocation.
/// </summary>
public class TopReducer : Reducer
{
    /// <summary>Maximum number of rows emitted by one Reduce call.</summary>
    private const int MaxRowsPerCall = 100;

    /// <summary>
    /// The output schema is a clone of the input schema.
    /// </summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return input.Clone();
    }

    /// <summary>
    /// Copies the first <see cref="MaxRowsPerCall"/> input rows to the output
    /// row and yields them; any further rows are read and discarded.
    /// </summary>
    /// <param name="input">Rows handed to this Reduce call.</param>
    /// <param name="output">Reusable output row that each emitted row is copied into.</param>
    /// <param name="args">Not used.</param>
    public override IEnumerable<Row> Reduce(RowSet input, Row output, string[] args)
    {
        int emitted = 0;
        foreach (Row row in input.Rows)
        {
            if (emitted >= MaxRowsPerCall)
            {
                // The counter stops at the limit, so it can never wrap around
                // on very large inputs and start emitting rows again.
                // NOTE(review): the input is still enumerated to the end, as
                // before; whether the runtime would allow an early break is
                // not visible here - confirm before changing.
                continue;
            }

            emitted++;
            row.Copy(output);
            yield return output;
        }
    }
}
View Code

2,对bing的所有浏览搜索的提取代码:

Scope:

技术分享
//Script GUID:8e9ba2e3-8288-49e1-a5ff-7776d653ae16
//Used for tracking history
//
// Purpose: from the SLAPI search-log page view, collect
// (Query, NormalizedUrl, PairCount) rows for zh-cn web traffic and write them
// to a .wsv file.

// Word breaker assembly and the resources shipped alongside it.
REFERENCE "/local/IndexQualityCJK/wb/WordBreaker.dll";
RESOURCE "/local/IndexQualityCJK/wb/unzip.exe";
RESOURCE "/local/IndexQualityCJK/wb/wordbreak.zip";

// Search Log API (SLAPI) assemblies.
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/SearchLogApi.dll";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Serializer.exe";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Microsoft.Live.Json.dll";

// URL normalizer (managed wrapper plus its native resource).
REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";


USING MS.Internal.Bing.DataMining.SearchLogApi;
USING Microsoft.Live.Json;

// Page-view log for the single day 2016-07-26, unsampled, Dataset = "Mobile".
SlapiPageView =    
      VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogPageView.view"
       //VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogSessionView.view"
       //PARAMS (Start = @"2013-09-01", End = @"2013-09-01", UseSample=false, Dataset="Mobile");
       PARAMS (Start = "2016-07-26", End = "2016-07-26", UseSample=false, Dataset= "Mobile");

// Keep rows that are not flagged as bot or marketing traffic and whose Market
// is "zh-cn" and Vertical is "web" (case-insensitive). The raw query is run
// through the word breaker and the request time is formatted as yyyy-MM-dd.
ZHCNTraffic =
    SELECT WordBreaker.BreakWords(Query_RawQuery, "zh-cn") AS Query,
           Request_RequestTime.ToString("yyyy-MM-dd") AS QDate,
           Page_Entities_WebResults
    FROM SlapiPageView
    WHERE Request_IsBotVNext == false AND Request_IsMarketingTraffic == false
          AND string.IsNullOrEmpty(Market) == false AND Market.ToLower() == "zh-cn"
          AND string.IsNullOrEmpty(Vertical) == false AND Vertical.ToLower() == "web";
// Disabled FormCode filter, kept for reference:
//AND string.IsNullOrEmpty(FormCode) == false AND (FormCode.ToUpper() == "QBLH" OR FormCode.ToUpper() == "QBRE" OR FormCode.ToUpper() == "MSNBHP" OR FormCode.ToUpper() == "MSNFLH" OR FormCode.ToUpper() == "BCNASI");

// Row expansion is delegated to the FEXLogSimpleExtractor processor.
ProcessWebEntity =
    PROCESS ZHCNTraffic
    USING FEXLogSimpleExtractor;

// Keep rows with Click > 0 and count them per query/URL pair.
// NOTE(review): COUNT() is used without an explicit GROUP BY; presumably the
// engine groups by the non-aggregated columns (Query, NormalizedUrl) - confirm.
// HAVING drops pairs counted fewer than 2 times and rows with an empty Query
// or an empty/null NormalizedUrl.
ClickQueryUrlPairs =
    SELECT Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Url)) AS NormalizedUrl,
           COUNT() AS PairCount
    FROM ProcessWebEntity
    WHERE Click > 0 HAVING PairCount >= 2 AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null;

// No rowset is named after OUTPUT; presumably the most recent rowset
// (ClickQueryUrlPairs) is written - confirm against SCOPE semantics.
OUTPUT
TO @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv";
View Code

C#:

技术分享
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
using MS.Internal.Bing.DataMining.SearchLogApi;

/// <summary>
/// String helpers for reducing a URL to a canonical lower-case form and for
/// extracting its host part.
/// </summary>
public class URLUtility
{
    /// <summary>
    /// Lower-cases <paramref name="url"/> and strips a leading "http://" or
    /// "https://", then a leading "www.", then a single trailing "/".
    /// </summary>
    /// <param name="url">URL text; may be null or empty.</param>
    /// <returns>
    /// The normalized URL, or an empty string when <paramref name="url"/> is
    /// null or empty.
    /// </returns>
    public static string NormalizeURL(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // URLs are machine identifiers: use culture-invariant casing and
        // ordinal comparisons so results do not depend on the current culture.
        url = url.ToLowerInvariant();
        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }

        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }

        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            url = url.Substring(0, url.Length - 1);
        }

        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first '/'.
    /// </summary>
    /// <param name="url">URL text; may be null or empty.</param>
    /// <returns>
    /// The normalized URL up to (not including) its first '/', or the whole
    /// normalized URL when it contains no '/'.
    /// </returns>
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);

        // The scheme has already been removed, so the first '/' (if any)
        // marks the end of the host portion.
        int slashPosition = url.IndexOf('/');
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }

        return url;
    }
}

/// <summary>
/// Processor that expands each input row's web-result list into output rows,
/// emitting one row for every web result whose
/// PositionOfEntityInTopLevelRegion equals 1.
/// </summary>
public class FEXLogSimpleExtractor : Processor
{
    /// <summary>
    /// Output columns: Query, QueryDate, Url, Host (all string), POS and Click (int).
    /// </summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return new Schema("Query:string,QueryDate:string,Url:string,Host:string,POS:int,Click:int");
    }

    /// <summary>
    /// Reads the "Query", "QDate" and "Page_Entities_WebResults" columns of
    /// each input row and yields one output row per position-1 web result.
    /// </summary>
    /// <param name="input">Rows to expand.</param>
    /// <param name="output">Reusable output row, filled in and yielded for each emitted result.</param>
    /// <param name="args">Not used.</param>
    public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
    {
        foreach (Row row in input.Rows)
        {
            string query = row["Query"].String;
            string queryDate = row["QDate"].String;
            var webEntities = row["Page_Entities_WebResults"].Value as MS.Internal.Bing.DataMining.SearchLogApi.WebResultList;

            // The "as" cast yields null when the column is empty or holds a
            // different type; such rows have no results to expand.
            if (webEntities == null)
            {
                continue;
            }

            for (int i = 0; i < webEntities.Count; i++)
            {
                var entity = webEntities[i];

                // Filter on position first so the URL work below is only done
                // for results that are actually emitted.
                int pos = entity.PositionOfEntityInTopLevelRegion;
                if (pos != 1)
                {
                    continue;
                }

                // A result without a URL cannot be turned into a host; skip it
                // rather than failing the whole job.
                string url = entity.TitleUrl;
                if (url == null)
                {
                    continue;
                }

                string host = URLUtility.GetHost(url);
                int click = entity.Clicks.Count;

                output["Query"].Set(query);
                output["QueryDate"].Set(queryDate);
                output["Url"].Set(url);
                output["Host"].Set(host);
                output["POS"].Set(pos);
                output["Click"].Set(click);

                yield return output;
            }
        }
    }
}
View Code

3,搜索查询和分类的提取代码:

技术分享
//Script GUID:5b7abb8b-defd-4f2b-b703-9882ee6e960b
//Used for tracking history
//
// Purpose: run RetroIndexProcessor over the latest index snapshot and write
// Url / Country / Language / Category for rows whose Language is "zh_chs".
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/RetroIndexProcessor.dll";
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE @"/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

USING RetroIndex;

// Latest snapshot view, unsampled, with TierFlag = 3.
// NOTE(review): the meaning of TierFlag = 3 is not visible here - confirm
// against the view definition.
Snapshot =
    VIEW "/shares/searchWebLoad/RetroIndex/Views/LatestSnapshot.view"
    PARAMS
    (
        Sample = false,
        TierFlag = 3
    );

// Unnamed projection of the columns taken from the snapshot.
// NOTE(review): presumably this anonymous rowset is the implicit input of the
// PROCESS statement below, which names no input - confirm.
SELECT Url,
       Header,
       Body,
       HttpHeader,
       CodePage
FROM Snapshot;

// Produce Url/Country/Language/Category via RetroIndexProcessor and keep only
// rows with a non-empty Language equal to "zh_chs" (case-insensitive).
Uberchunk =
    PROCESS
    PRODUCE Url,
            Country,
            Language,
            Category
    USING RetroIndexProcessor
    HAVING string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs";


// No rowset is named after OUTPUT; presumably the most recent rowset
// (Uberchunk) is written - confirm against SCOPE semantics.
OUTPUT
TO @"/local/IndexCJK/MobileTopSites/UrlCategory.wsv";
View Code

C#:

技术分享
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

/// <summary>
/// Predicates over a category string made of "#TAB#"-separated entries.
/// </summary>
public class Utility
{
    // Separator between entries of a category string.
    private static readonly string[] EntrySeparator = new string[] { "#TAB#" };

    // Entry suffixes accepted by CJKVersionMobileFriendly. Note that
    // "MobileUnFriendly" is part of this set as well.
    private static readonly string[] FriendlySuffixes = new string[]
    {
        "Mobi",
        "CrossDevice",
        "MobileFriendly",
        "MobileUnFriendly"
    };

    // Entry suffixes accepted by CJKVersionMobileUnFriendly.
    private static readonly string[] UnFriendlySuffixes = new string[]
    {
        "MobileUnFriendly"
    };

    /// <summary>
    /// True when some entry of <paramref name="category"/> starts with "aa00"
    /// and ends with "Mobi", "CrossDevice", "MobileFriendly" or
    /// "MobileUnFriendly". Null or empty input yields false.
    /// </summary>
    public static bool CJKVersionMobileFriendly(string category)
    {
        return HasMatchingEntry(category, FriendlySuffixes);
    }

    /// <summary>
    /// True when some entry of <paramref name="category"/> starts with "aa00"
    /// and ends with "MobileUnFriendly". Null or empty input yields false.
    /// </summary>
    public static bool CJKVersionMobileUnFriendly(string category)
    {
        return HasMatchingEntry(category, UnFriendlySuffixes);
    }

    // Splits the category string on "#TAB#" (dropping empty entries) and
    // reports whether any entry starts with "aa00" and ends with one of the
    // given suffixes.
    private static bool HasMatchingEntry(string category, string[] suffixes)
    {
        if (string.IsNullOrEmpty(category))
        {
            return false;
        }

        string[] entries = category.Split(EntrySeparator, StringSplitOptions.RemoveEmptyEntries);
        return Array.Exists(
            entries,
            entry => entry.StartsWith("aa00") && Array.Exists(suffixes, suffix => entry.EndsWith(suffix)));
    }
}

/// <summary>
/// Processor that assigns an integer MobileClassifier value (0, 1, 2 or 3) to
/// each input row that has a non-empty Url and a Language of "zh_chs".
/// </summary>
public class CJKVersionMobileOkClassifierProcessor : Processor
{
    /// <summary>
    /// Output columns: Url (string) and MobileClassifier (int).
    /// </summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        Schema produced = new Schema("Url:string, MobileClassifier:int");
        return produced;
    }

    /// <summary>
    /// Yields one (Url, MobileClassifier) row per qualifying input row; rows
    /// with an empty Url, an empty Language, or a Language other than "zh_chs"
    /// (compared after lower-casing) are skipped.
    /// </summary>
    public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
    {
        foreach (Row record in input.Rows)
        {
            string pageUrl = record["Url"].String;
            string pageLanguage = record["Language"].String;

            if (string.IsNullOrEmpty(pageUrl)
                || string.IsNullOrEmpty(pageLanguage)
                || pageLanguage.ToLower() != "zh_chs")
            {
                continue;
            }

            // Feature columns, read in the same order as before.
            string categories = record["Category"].String;
            string mobileUrls = record["DUPipeline_MobileUrls"].String;
            string responsiveSpans = record["DUPipeline_ResponsiveDesignSpans"].String;
            string duv2MobileUrl = record["DUV2_MobileUrl"].String;
            string okClassifierV1 = record["InjHdr_MobileOkClassifier_v1"].String;
            string okXV1 = record["InjHdr_MobileOkX_v1"].String;
            string mobileRedirect = record["InjHdr_MobileRedirect_V1"].String;
            // Read but not used in the decision below.
            string spamJunkRuleId = record["SpamJunkRuleID"].String;

            // Prefer the v1 classifier header; fall back to the X header when
            // the former is empty.
            string okValue = string.IsNullOrEmpty(okClassifierV1) ? okXV1 : okClassifierV1;

            // Any one of these signals is enough for value 1.
            bool anyPositiveSignal =
                !string.IsNullOrEmpty(mobileUrls)
                || !string.IsNullOrEmpty(responsiveSpans)
                || !string.IsNullOrEmpty(duv2MobileUrl)
                || !string.IsNullOrEmpty(mobileRedirect)
                || Utility.CJKVersionMobileFriendly(categories)
                || okValue == "1";

            int classifier;
            if (anyPositiveSignal)
            {
                classifier = 1;
            }
            else if (Utility.CJKVersionMobileUnFriendly(categories) || okValue == "3")
            {
                classifier = 3;
            }
            else if (okValue == "2")
            {
                classifier = 2;
            }
            else
            {
                classifier = 0;
            }

            output["Url"].Set(pageUrl);
            output["MobileClassifier"].Set(classifier);
            yield return output;
        }
    }
}
View Code

4,对IE和bing进行union,然后对相同的query进行合并。

Scope

技术分享
//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history
//
// Purpose: concatenate the IE and Bing (Query, Url, Count) files and sum the
// counts, writing the result ordered by Query.

// NOTE(review): the IE extraction script shown earlier on this page writes
// "IEqueryclickurlpairsAll.wsv"; confirm that the path read here is the
// intended input file.
ie =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
    USING DefaultTextExtractor();
bing =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
    USING DefaultTextExtractor();

// UNION ALL keeps every row from both inputs, duplicates included.
union_all =
    SELECT *
    FROM ie
    UNION ALL
    SELECT *
    FROM bing;

// NOTE(review): SUM(Count) is used without an explicit GROUP BY; presumably
// the engine groups by the non-aggregated columns (Query, Url) - confirm.
result =
    SELECT Query,
           Url,
           SUM(Count) AS NewCount
    FROM union_all ORDER BY Query;
OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";
View Code

5,

 

6,得到了query->Category之后,要算哪个category出现得最多。

做法是:category每出现一次,就乘以对应的clickCount作为权重,然后把这些加权值累加起来。

这里用到了reduce来做。

 

Intern---Microsoft Academic China Team

标签:

原文地址:http://www.cnblogs.com/yueyebigdata/p/5729713.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!