【转载】用sas抓取网页数据简易版

时间：2015-12-28 10:33:34 阅读：305 评论：0 收藏：0 [点我收藏+]

标签：

链接：http://www.1point3acres.com/bbs/thread-91000-1-1.html

如果只是抓取一些规则比较简单的网页，可以用SAS来做，纯属娱乐。SAS入门的话，推荐SAS BASE和ADVANCE认证的教材：这两个认证本身其实没什么用，但教材的内容对于一个专业的SAS Programmer已经足够了。掌握了SAS data step、SQL和macro，日常的数据处理应该都没问题。
%macro webScholar;
/*
 * Scrape Google Scholar result pages for the query "python" and collect
 * each paper's title and citation count into mywork.results_web
 * (columns: titles, citenumber).
 *
 * Takes no parameters. Result pages start=0, 10 and 20 are fetched.
 * Side effects: assigns libref MYWORK, creates mywork.results_web,
 * mywork.web and mywork.extracted, and performs HTTP requests.
 */
%local pageno url;

/* MEMLIB keeps the library's data sets in memory instead of on D:\, which
   speeds up reads. The results therefore live only in memory: copy
   mywork.results_web to a disk library afterwards if it must persist. */
libname mywork "D:\" memlib;

/* Empty table that accumulates the results across pages. */
proc sql;
    create table mywork.results_web (titles char(500), citenumber char(500));
quit;

/* pageno is the "start" offset of a result page. 0, 10 and 20 are the
   first three pages; raise the upper bound to fetch more. */
/* The keyword after q= in the URL below can be replaced by another term. */
%do pageno = 0 %to 20 %by 10;

    /* Build the page URL. The literal is single-quoted so that &q, &hl and
       &as_sdt are not treated as macro references; the offset is then
       patched in with a double-quoted regex where &pageno does resolve. */
    data _null_;
        length url $ 256;
        url = 'http://scholar.google.com/scholar?start=0&q=python&hl=en&as_sdt=0,5';
        url = prxchange("s/start=0/start=&pageno/", 1, url);
        /* symputx strips the trailing blanks of the $256 variable. */
        call symputx("url", url, "L");
    run;

    /* recfm=n is intended to read the response as a stream cut into
       256-byte pieces: a single line of HTML can exceed the 32767-byte
       limit of a character variable and would otherwise be truncated.
       NOTE(review): confirm recfm=n is honoured by the URL access method. */
    filename web url "%superq(url)" recfm=n debug;

    /* $varying reads exactly len bytes per record; see the SAS help. */
    data mywork.web;
        length webtext $ 256;
        infile web length=len;
        input webtext $varying256. len;
        textlength = len;
    run;

    filename web clear;

    data mywork.extracted (keep=r cite);
        length s $ 32767;   /* longest character variable SAS can handle */
        length r $ 500;
        length cite $ 500;
        /* s is retained so page text accumulates across observations;
           the compiled pattern ids are retained so they are parsed once. */
        retain s re_entry re_title re_cite;

        /* SAS has no HTML parsing package, so regular expressions are used. */
        if _n_ = 1 then do;
            /* a whole result: title heading followed by its "Cited by" link */
            re_entry = prxparse('/<h3(\w|\W)*?<\/h3>(\w|\W)*?>Cite(d by )??\d*<\/a>/i');
            /* the title heading alone */
            re_title = prxparse('/<h3(\w|\W)*?<\/h3>/i');
            /* the citation link alone */
            re_cite  = prxparse('/>Cite(d by )??\d*<\/a>/i');
        end;

        set mywork.web;
        /* NOTE(review): cats() strips leading/trailing blanks of each
           piece, so a space that falls exactly on a 256-byte boundary
           is lost. */
        s = cats(s, webtext);

        /* Consume every complete result currently held in the buffer. */
        position = .;
        do until (position = 0);
            call prxsubstr(re_entry, s, position, matchlen);
            if position ^= 0 then do;
                call prxsubstr(re_title, s, position, matchlen);

                /* Strip tags, [bracketed] markers and HTML entities from
                   the title. The substring is passed directly so the
                   markup is removed before the value is cut to the 500
                   characters of r. */
                r = prxchange('s/(<[^>]*?>)|(\[[^\]]*?\])|(&[^;]*?;s?)//', -1,
                              substr(s, position, matchlen));
                s = substrn(s, position + matchlen);

                call prxsubstr(re_cite, s, position, matchlen);
                cite = substr(s, position, matchlen);

                /* Keep only the digits of the citation link. */
                cite = prxchange('s/(\D*)(\d*)(\D*)/$2/', 1, cite);
                s = substrn(s, position + matchlen);
                output;
            end;
        end;

        /* Drop the oldest 256 characters when the buffer nears its limit. */
        if length(s) > 29000 then s = substrn(s, 257);
    run;

    /* Append this page's titles and citation counts to the result table. */
    /* Because of MEMLIB this table is not written to disk. */
    proc sql;
        insert into mywork.results_web (titles, citenumber)
        select r, cite from mywork.extracted;
    quit;

%end;

%mend webScholar;

%webScholar

【转载】用sas抓取网页数据简易版

标签：

原文地址：http://www.cnblogs.com/yizhenfeng/p/5081658.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行