python-toolkit

整理部分代码

python常见代码的归纳总结

判断文件或者文件夹是否存在

# os.path.exists() is true for both files and directories; this branch runs
# when the path is missing.
if not os.path.exists(rootdir):
    pass  # handle the missing path here

创建文件夹

# Create the single directory rootdir.
os.mkdir(rootdir)

调用系统命令

# Run the command string cmd through the system shell.
os.system(cmd)

字典循环

# Iterate over key/value pairs; `dict` stands for your own dictionary variable.
for key, value in dict.items():
    pass  # process key and value here

打开文件并读取内容进行处理

# Read the file line by line; the with-block closes it even if processing fails.
with open('xxxx.txt', encoding='utf-8') as fd:
    for line in fd:
        print(line)

创建文件并写入内容

# Append one line, creating the file if needed; the with-block closes the
# handle even when the write raises.
with open('xxxx.txt', 'a+', encoding='utf-8') as fd:
    fd.write('aaaaa' + '\n')

使用xlrd读取EXCEL

导入

import xlrd

打开excel

data = xlrd.open_workbook('demo.xls')     # note: "workbook" is all lowercase in open_workbook

查看文件中包含sheet的名称

# List the names of the sheets contained in the workbook.
data.sheet_names()

得到第一个工作表,或者通过索引顺序 或 工作表名称

# Three ways to obtain a worksheet: from the sheet list, by index, or by name.
table = data.sheets()[0]
table = data.sheet_by_index(0)
table = data.sheet_by_name(u'Sheet1')

获取行数和列数

# Number of rows and columns in the sheet.
nrows = table.nrows
ncols = table.ncols

获取整行和整列的值(数组)

# Values of a whole row / a whole column (i is the zero-based index).
table.row_values(i)
table.col_values(i)

循环行,得到索引的列表

# Print the list of cell values of every row in the sheet.
for rownum in range(table.nrows):
    print(table.row_values(rownum))

单元格

# table.cell(rowx, colx) takes the zero-based row first, then the column.
cell_A1 = table.cell(0, 0).value
cell_C4 = table.cell(3, 2).value  # C4 = row 4 (index 3), column C (index 2)

分别使用行列索引

# table.row(r) / table.col(c) return the cells of one row / one column.
cell_A1 = table.row(0)[0].value
cell_A2 = table.col(0)[1].value  # column A (index 0), second row (index 1)

简单的写入

# Write a string value into the in-memory cell at (row, col), then read it back.
row = 0
col = 0
ctype = 1 # cell type: 0 empty, 1 string, 2 number, 3 date, 4 boolean, 5 error
value = 'lixiaoluo'
xf = 0 # extended formatting (0 by default, per the original note)
table.put_cell(row, col, ctype, value, xf)
table.cell(0,0) # text: u'lixiaoluo'
table.cell(0,0).value # 'lixiaoluo'

使用xlwt写入EXCEL

导入xlwt

import xlwt

新建一个excel文件

file = xlwt.Workbook() # note: here "Workbook" starts with a capital W (unlike xlrd.open_workbook)

新建一个sheet

# Add a sheet named 'sheet name' to the workbook.
table = file.add_sheet('sheet name')

写入数据table.write(行,列,value)

# table.write(row, column, value)
table.write(0,0,'test')

如果对一个单元格重复操作,会引发

returns error:
# Exception: Attempt to overwrite cell:
# sheetname=u'sheet 1' rowx=0 colx=0

所以在新建sheet时加cell_overwrite_ok=True解决

# cell_overwrite_ok=True lets the same cell be written more than once.
table = file.add_sheet('sheet name',cell_overwrite_ok=True)

保存文件

# Save the workbook to demo.xls.
file.save('demo.xls')

另外,使用style

style = xlwt.XFStyle() # initialise a style

font = xlwt.Font() # create a font for the style

font.name = 'Times New Roman'

font.bold = True

style.font = font # attach the font to the style

table.write(0, 0, 'some bold Times text', style) # write a cell using the style

命令行 getopt

# Parse -h/--help, -i/--ip <value> and -p/--port <value> from the command line.
try:
    options, args = getopt.getopt(sys.argv[1:], "hp:i:", ["help", "ip=", "port="])
except getopt.GetoptError as err:
    # Report the bad option and exit non-zero instead of quitting silently
    # with a success status.
    print(err)
    usage()
    sys.exit(2)
for name, value in options:
    # The three option groups are mutually exclusive, so test them as a chain.
    if name in ("-h", "--help"):
        usage()
    elif name in ("-i", "--ip"):
        print(value)
    elif name in ("-p", "--port"):
        print(value)

简单爬虫

import requests
# Browser-like headers sent with every request, and one shared session.
AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
HEADERS = {
    'User-Agent': AGENT,
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Accept': '*/*',
}
session = requests.session()

模拟登录

# Form fields posted to the login endpoint (placeholder values).
postdata = {
    'defaults': 'xxx',
    'fromLogin': 'xxx',
    'userName': 'xxx',
    'password': 'xxxx',
}
url = 'xxxxxxxx'


def login():
    """Post the login form with the shared session.

    Returns:
        True when the server answers with HTTP 200, False otherwise.
    """
    # The original snippet used `return` at module level, which is a syntax
    # error; the statements therefore live in a function.
    login_info = session.post(url, headers=HEADERS, data=postdata, verify=False)
    if login_info.status_code == requests.codes.ok:
        print('login success')
        return True
    print('login err')
    return False

下载html页面

def downloadUrl(rootdir, url, orgid, page):
    """Fetch *url* with the shared session and save the body as an HTML file.

    The body is written to ``<rootdir>/<orgid>_<page>.html``.

    Returns:
        'ok' when the page was saved; 'err' when characters 1-6 of the body
        spell 'script' (the body is printed in that case) or when the body is
        shorter than 60 characters.
    """
    html = session.get(url, headers=global_config.HEADERS, verify=False)
    # Decode the response once instead of on every access to .text.
    body = html.text
    if body[1:7] == 'script':
        print(body)
        return "err"
    if len(body) < 60:
        return "err"
    target = rootdir + "/" + str(orgid) + '_' + str(page) + ".html"
    # The with-block closes the file even if the write fails.
    with open(target, "w", encoding='utf-8') as sample:
        sample.write(body)
    return 'ok'

解析JSON文件内容

def scrapy_by_file(json_file_name):
    """Load a JSON file and pass every entry of its 'rows' list to scrapy_by_row.

    If the content is not valid JSON, the file name is printed and the
    function returns without processing anything.
    """
    import json  # local import: this snippet has no import section of its own

    # Read the whole file; the with-block guarantees the handle is closed.
    with open(json_file_name, encoding='utf-8') as json_file:
        text = json_file.read()
    # Drop the BOM character that files saved on Windows may start with.
    if text.startswith(u'\ufeff'):
        text = text[1:]
    try:
        json_data = json.loads(text)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        print(json_file_name)
        return
    for row in json_data['rows']:
        scrapy_by_row(row)


def scrapy_by_row(row):
    """Read the organization id and the family id from one row.

    Prints 'errrr' and returns when either nested key is missing.
    """
    try:
        orgid = row['organization']['id']
        familyid = row['censusRegisterFamily']['id']
    except (KeyError, TypeError):
        print('errrr')
        return
    # Further processing of orgid / familyid goes here.

遍历文件夹

遍历目录(rootdir),对遍历到的每个html文件都执行dirFunc

def waklThroughDir(rootdir, dirFunc):
    """Walk *rootdir* recursively and call dirFunc(path) for html files.

    Every file name found is printed; dirFunc receives the joined path of
    each file whose last dot-separated part is 'html'.
    """
    for folder, _subdirs, files in os.walk(rootdir):
        for entry in files:
            print(entry)
            # Only files with the html suffix are handed to the callback.
            suffix = entry.rsplit('.', 1)[-1]
            if suffix != 'html':
                continue
            dirFunc(os.path.join(folder, entry))

## 采集温州房产网基本信息

# -*- coding: utf-8 -*-
import re
import requests
import time

#-----------------------------Regular-expression constants used for parsing------------------------------------------------------------------
# Captures the number shown between '共找到' and '符合条件的记录' (used to derive the page count)
PAGE_NUM = '共找到 (.*?) 符合条件的记录'
# Captures the residential-community name (applied to whitespace-stripped HTML)
NAME = 'texttext_title"><ahref(.*?)</a></div><divclass="texttext_moreinfo">'
# Captures the community price
PRICE = 'class="hot_price">(.*?)</span>'
# Captures the community address
ADDRESS = 'text_moreinfo">(.*?)</div><divclass="texttext_moreinfo"><span>'
# Directory where generated files are written
ROOTDIR = 'F:\\test\\'

#-----------------------------Browser-like request headers; per the original note, requests without them are detected as a bot and blocked--------------------------------------
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
    'Host': 'www.0577home.net',
    'Upgrade-Insecure-Requests': '1'
}

#-----------------------------Fetch one page of listings; pageNo is the page number--------------------------------------------------------
def getHouseListByPageno(pageNo):
    """Download listing page *pageNo* and save the raw HTML under ROOTDIR.

    The response text is written to
    ``ROOTDIR + 'houseList_pageNo<pageNo>.txt'`` as UTF-8.

    Args:
        pageNo: page number (int or str); it is converted with str().
    """
    url = 'http://www.0577home.net/xiaoqu/list_0_0_0_0_0_0_0_' + str(pageNo) + '.html'
    # The with-block closes the session when the request is done.
    with requests.session() as session:
        houseList = session.get(url, headers=HEADERS, verify=False)
    # Write the page to disk; the file is closed even if the write fails.
    with open(ROOTDIR + "houseList_pageNo" + str(pageNo) + ".txt", 'w', encoding='utf-8') as fh:
        fh.write(houseList.text)

#-------------------------------Work out how many listing pages must be fetched------------------------------------------------------------------
def getPageNum():
    """Return the number of listing pages, derived from the saved first page.

    Reads ``ROOTDIR + 'houseList_pageNo1.txt'``, extracts the first PAGE_NUM
    match and divides it by 20 (presumably the page size -- confirm against
    the site), rounding up.

    Returns:
        int: the page count, never less than 1.

    Raises:
        ValueError: if PAGE_NUM does not match anything in the saved page.
    """
    with open(ROOTDIR + 'houseList_pageNo1.txt', encoding='utf-8') as f:
        rawContent = f.read()
    pageNum = re.findall(PAGE_NUM, rawContent)
    if not pageNum:
        raise ValueError('record count not found in houseList_pageNo1.txt')
    # Ceiling division: "/ 20 + 1" produced a float on Python 3 and counted
    # one page too many whenever the total was an exact multiple of 20.
    return max(1, (int(pageNum[0]) + 19) // 20)

def parseHouseListToFile(srcFile, dstFile):
    """Extract name, price and address from a saved listing page.

    One line per entry is written to *dstFile* in the form
    ``name$price$address`` followed by a newline.

    Args:
        srcFile: path of the saved listing page (UTF-8 text).
        dstFile: an open, writable text file object; it is not closed here.

    Note:
        The three extracted lists are paired position by position; if their
        lengths differ, only as many lines as the shortest list are written.
    """
    with open(srcFile, encoding='utf-8') as f:
        rawContent = f.read()
    # The patterns contain no whitespace, so strip it all from the page first.
    content = re.sub(r'\s+', '', rawContent)
    # Each capture still carries leading markup; keep the text after the last '>'.
    names = [dname[dname.rfind('>') + 1:] for dname in re.findall(NAME, content)]
    prices = re.findall(PRICE, content)
    address = [daddr[daddr.rfind('>') + 1:] for daddr in re.findall(ADDRESS, content)]
    for name, price, addr in zip(names, prices, address):
        # Fields are separated by '$'; each record ends with a newline.
        dstFile.write(name + '$' + price + '$' + addr + '\n')

# -------------------------------Main: download and parse the listings--------------------------------------------------------------
if __name__ == '__main__':
    # --------------------- download pages ---------------------
    # Fetch the first page, then derive the total page count from it.
    getHouseListByPageno(1)
    pageNum = getPageNum()
    # Fetch the remaining pages.
    for i in range(2, int(pageNum) + 1):
        getHouseListByPageno(str(i))
    # --------------------- parse pages ------------------------
    # Output file names are prefixed with today's date (YYYYMMDD).
    localtime = time.strftime('%Y%m%d', time.localtime(time.time()))
    listPath = ROOTDIR + localtime + '_houseList.txt'
    # Parse exactly the pages downloaded above; the previous hard-coded
    # range(1, 115) broke whenever the site had a different page count.
    with open(listPath, 'a+', encoding='utf-8') as f:
        for k in range(1, int(pageNum) + 1):
            parseHouseListToFile(ROOTDIR + "houseList_pageNo" + str(k) + ".txt", f)
    with open(listPath, encoding='utf-8') as f, \
            open(ROOTDIR + localtime + '_houseInfo.txt', 'w', encoding='utf-8') as fd:
        for k, line in enumerate(f):
            data = line.strip('\n').split('$')
            # NOTE(review): parseHouseListToFile writes three '$'-separated
            # fields, so data[3] looks out of range -- confirm where the
            # fourth field is meant to come from.
            idx = data[3]
            # NOTE(review): getHouseInfoByPageno and parseHouseInfo are not
            # defined in the visible part of this file.
            getHouseInfoByPageno(idx, k)
            houseInfo = parseHouseInfo(ROOTDIR + "houseInfo_pageNo" + str(idx) + ".html")
            print(str(k) + "$".join(data) + '$' + "$".join(houseInfo))
            fd.write("$".join(data) + '$' + "$".join(houseInfo) + '\n')

调用java

import sys
import jpype

# Pass the first command-line argument to the static decrypt() method of the
# Java class why.fmrt.decrypt.DECRYPT and print the result.
jarpath = '/home/dsadm/why/python'
name = sys.argv[1]
# Start the JVM with jarpath supplied as java.ext.dirs.
jvm_option = "-Djava.ext.dirs=%s" % jarpath
jpype.startJVM(jpype.getDefaultJVMPath(), jvm_option)
DECRYPT = jpype.JClass('why.fmrt.decrypt.DECRYPT')
upperName = DECRYPT.decrypt(name)
print(upperName)
jpype.shutdownJVM()

构建 web 页面

import os

import tornado.httpserver
import tornado.ioloop
import tornado.options
import tornado.web

from view import *

from tornado.options import define, options
# Command-line option --port (int, default 8000).
define("port", default=8000, help="run on the given port", type=int)


class Application(tornado.web.Application):
    """Tornado application that routes "/" to Indexhandler."""

    def __init__(self):
        # Templates live in the "templates" directory next to this file.
        template_dir = os.path.join(os.path.dirname(__file__), 'templates')
        routes = [(r"/", Indexhandler)]
        super().__init__(
            routes,
            template_path=template_dir,
            autoescape=None,
            debug=False,
        )

if __name__ == "__main__":
    # Read --port (and the other tornado options) from the command line.
    tornado.options.parse_command_line()
    app = Application()
    http_server = tornado.httpserver.HTTPServer(app, xheaders=True)
    http_server.listen(options.port)
    # Run the event loop; this call blocks.
    io_loop = tornado.ioloop.IOLoop.instance()
    io_loop.start()

多进程

import multiprocessing
# Start PROCESS_NUM worker processes; each one runs worker(process_id).
# NOTE(review): jobs, worker and PROCESS_NUM are not defined in this snippet --
# presumably jobs is a list created beforehand; confirm against the caller.
for process_id in range(PROCESS_NUM):
    p = multiprocessing.Process(target=worker, args=(process_id,))
    jobs.append(p)
    p.start()

多线程

# -*- coding: utf-8 -*-

import threading

class Threadconfig():
    """Collect worker threads, start them as daemons and wait for them.

    Typical use: ``topen()``, one ``build(func, **kwargs)`` per task, then
    ``run()``.
    """

    def __init__(self, thread_size):
        # Number of built tasks (counted from the first) that run() waits for.
        self.thread_size = thread_size
        # Created here so build() also works when topen() was not called.
        self.thread_tasks = []

    def topen(self):
        """Reset the task list, discarding any previously built threads."""
        self.thread_tasks = []

    def build(self, func, **kwargs):
        """Create (but do not start) a thread that will call func(**kwargs)."""
        self.thread_task = threading.Thread(target=func, kwargs=kwargs)
        self.thread_tasks.append(self.thread_task)

    def run(self):
        """Start every built thread and block until the waited-for ones end.

        All threads are started as daemon threads. As before, only the first
        ``thread_size`` tasks are waited for; having fewer tasks than
        ``thread_size`` is no longer an IndexError.
        """
        for thread_task in self.thread_tasks:
            thread_task.daemon = True
            thread_task.start()
        # join() blocks without spinning the CPU and replaces the polling loop
        # built on Thread.isAlive(), which was removed in Python 3.9.
        for thread_task in self.thread_tasks[:self.thread_size]:
            thread_task.join()

    def __del__(self):
        self.thread_tasks = []

移除中文分隔符号

  cmd = "sed ':a;N;$ s/\\r\\n//g;ba' " + oldfile + " > " + newfile
os.system(cmd)