热门关键字:  ubuntu  分区  函数  Fedora  linux系统进程

使用 Python 通过 Google 和百度进行网页搜索

来源: 作者: 时间:2008-02-21 Tag: 点击:

==================

$ cat baidu.py
#!/usr/bin/env python
#-*- encoding:utf-8 -*-

import httplib
import re
import sys
import urllib

# Matches one result link: from the opening tag that carries an onclick
# handler up to the nearest closing </a>.  DOTALL lets a link span lines.
_RESULT_LINK_RE = re.compile(r'<a onclick="return.*?</a>', re.DOTALL)


def request_and_save(conn, query_str, f):
        """Fetch one page over ``conn`` and append its result links to ``f``.

        Issues ``GET query_str`` on ``conn``, reads the complete response
        body, and writes every span that starts with ``<a onclick="return``
        and ends at the nearest following ``</a>`` to ``f``.  Each span is
        followed by an HTML line break and a newline.

        conn      -- object providing ``request(method, url)`` and
                     ``getresponse()``; the response must provide ``read()``
        query_str -- request path, including the query string
        f         -- object providing ``write()``

        Returns None.  Nothing is written when the body holds no complete
        matching link; an opening tag without a closing ``</a>`` is skipped.
        """
        conn.request("GET", query_str)
        body = conn.getresponse().read()

        # finditer walks the body in place; slicing off the consumed prefix
        # after every match would copy the rest of the page each time.
        for link in _RESULT_LINK_RE.finditer(body):
                f.write(link.group(0))
                f.write("<br>\n")
# end function

# Script body: request eight Baidu result pages for the keyword given as the
# first command-line argument and save the extracted links to
# "<keyword>-search.html" in the current directory.
if len(sys.argv) < 2:
        print("Usage: baidu.py words")
        sys.exit(0)

keyword = sys.argv[1]
# Percent-encode the keyword so that spaces, '&', '#' and similar characters
# cannot break the request line or add stray query parameters.
quoted_keyword = urllib.quote_plus(keyword)

# Parameters are joined with a bare '&'.  A backslash in front of it is shell
# escaping, not Python escaping: Python would keep the backslash and send it
# to the server as part of the path.
# The first request uses the short query form; the other seven step "pn"
# from 10 to 70 (presumably the result offset, with rn=10 results per page --
# TODO confirm against the search API).
query_paths = ["/s?wd=" + quoted_keyword + "&cl=3"]
for offset in range(10, 80, 10):
        query_paths.append("/s?lm=0&si=&rn=10&ct=0&wd=" + quoted_keyword +
                           "&pn=" + str(offset) + "&ver=0&cl=3")

# NOTE(review): the saved page declares gb2312 and the response bytes are
# copied through unchanged -- this assumes the server answers in gb2312.
html_header = "<html><head><meta http-equiv=\"content-type\" content=\"text/html;charset=gb2312\"><title>kf701 python search tool</title></head><body>\n"
html_header += "<p align=center><font size=3>kf701 python search tool</font></p>"
html_end = "</body></html>"

output_name = keyword + "-search.html"
print('Search ' + keyword + ', Save result in ' + output_name)

conn = httplib.HTTPConnection("www.baidu.com")
try:
        # The with-block closes the output file even when a request fails.
        with open(output_name, "w") as f:
                f.write(html_header)
                for query_path in query_paths:
                        request_and_save(conn, query_path, f)
                f.write(html_end)
finally:
        conn.close()
最新评论共有 4 位网友发表了评论
发表评论
评论内容:不能超过250字,需审核,请自觉遵守互联网相关政策法规。
用户名: 密码:
匿名?
注册
栏目列表