整理部分代码
python常见代码的归纳总结
判断文件或者文件夹是否存在
if(os.path.exists(rootdir) == False)
创建文件夹
os.mkdir(rootdir)
调用系统命令
os.system(cmd)
字典循环
for key,value in dict.items()
打开文件并读取内容进行处理
# Read a text file line by line; the with-block closes the handle even on error.
with open('xxxx.txt', encoding='utf-8') as fd:
    for line in fd:
        print(line)  # print() function — original used Python 2 `print line`
创建文件并写入内容
# Append one line to the file (created if absent); with-block closes the handle.
with open('xxxx.txt', 'a+', encoding='utf-8') as fd:
    fd.write('aaaaa' + '\n')
使用xlrd读取EXCEL
导入
import xlrd
打开excel
data = xlrd.open_workbook('demo.xls') # 注意这里的workbook首字母是小写
查看文件中包含sheet的名称
data.sheet_names()
得到第一个工作表,或者通过索引顺序 或 工作表名称
table = data.sheets()[0]
table = data.sheet_by_index(0)
table = data.sheet_by_name(u'Sheet1')
获取行数和列数
nrows = table.nrows
ncols = table.ncols
获取整行和整列的值(数组)
table.row_values(i)
table.col_values(i)
循环行,得到索引的列表
# Iterate every row of the xlrd sheet by index and dump its values.
for rownum in range(table.nrows):
    print(table.row_values(rownum))  # print() — original used Python 2 syntax
单元格
cell_A1 = table.cell(0,0).value
cell_C4 = table.cell(2,3).value
分别使用行列索引
cell_A1 = table.row(0)[0].value
cell_A2 = table.col(1)[0].value
简单的写入
# Write a single value into an xlrd sheet in memory with put_cell().
row = 0
col = 0
ctype = 1 # cell type: 0 empty, 1 string, 2 number, 3 date, 4 boolean, 5 error
value = 'lixiaoluo'
xf = 0 # extended formatting index (0 = default)
table.put_cell(row, col, ctype, value, xf)
table.cell(0,0) # cell object, text: u'lixiaoluo'
table.cell(0,0).value # 'lixiaoluo'
使用xlwt写入EXCEL
导入xlwt
import xlwt
新建一个excel文件
file = xlwt.Workbook() #注意这里的Workbook首字母是大写,无语吧
新建一个sheet
table = file.add_sheet('sheet name')
写入数据table.write(行,列,value)
table.write(0,0,'test')
如果对一个单元格重复操作,会引发
returns error:
# Exception: Attempt to overwrite cell:
# sheetname=u'sheet 1' rowx=0 colx=0
所以在打开时加cell_overwrite_ok=True解决
table = file.add_sheet('sheet name',cell_overwrite_ok=True)
保存文件
file.save('demo.xls')
另外,使用style
style = xlwt.XFStyle() # initialize a style object
font = xlwt.Font() # create a font for the style
font.name = 'Times New Roman'
font.bold = True
style.font = font # attach the font to the style
table.write(0, 0, 'some bold Times text', style) # write a cell using the style
命令行 getopt
# Parse command-line flags: -h/--help, -i/--ip <addr>, -p/--port <port>.
# NOTE(review): relies on `import getopt, sys` and a usage() helper that are
# defined elsewhere — not visible in this file.
try:
    options,args = getopt.getopt(sys.argv[1:],"hp:i:",["help","ip=","port="])
except getopt.GetoptError:
    # Unknown/invalid option: exit without printing anything.
    sys.exit()
for name,value in options:
    if name in ("-h","--help"):
        usage()
    if name in ("-i","--ip"):
        print(value)
    if name in ("-p","--port"):
        print(value)
简单爬虫
# Shared HTTP session and browser-like headers for the scraping requests.
import requests

AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
HEADERS = {
    'User-Agent': AGENT,
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Accept': '*/*'
}  # closing brace was missing in the original snippet

session = requests.session()
模拟登录
# Simulated login. The original snippet had a stray closing brace and
# top-level `return` statements (invalid Python); wrapped in a function.
def login():
    """POST the login form with the shared session; True on HTTP 200."""
    postdata = {
        'defaults': 'xxx',
        'fromLogin': 'xxx',
        'userName': 'xxx',
        'password': 'xxxx'
    }
    url = 'xxxxxxxx'
    # verify=False skips TLS verification — presumably a self-signed cert; confirm.
    login_info = session.post(url, headers=HEADERS, data=postdata, verify=False)
    if login_info.status_code == requests.codes.ok:
        print('login success')
        return True
    else:
        print('login err')
        return False
下载html页面
def downloadUrl(rootdir, url, orgid, page):
    """Download *url* and save it as <rootdir>/<orgid>_<page>.html.

    Returns 'ok' on success, 'err' when the response looks like an
    anti-scraping script stub or is too short to hold real content.
    """
    html = session.get(url, headers=global_config.HEADERS, verify=False)
    # A body whose chars 1..6 are 'script' is the site's block/redirect stub.
    if html.text[1:7] == 'script':
        print(html.text)
        return "err"
    # Anything under 60 characters carries no usable listing data.
    if len(html.text) < 60:
        return "err"
    # with-block guarantees the file is closed even if the write raises.
    with open(rootdir + "/" + str(orgid) + '_' + str(page) + ".html", "w",
              encoding='utf-8') as sample:
        sample.write(html.text)
    return 'ok'
解析JSON文件内容
def scrapy_by_file(json_file_name):
    """Parse one JSON file and process every entry in its 'rows' list."""
    # Read the whole file; the with-block closes the handle (original leaked it).
    with open(json_file_name, encoding='utf-8') as fd:
        text = fd.read()
    # Strip a UTF-8 BOM left over from files created on Windows.
    if text.startswith(u'\ufeff'):
        text = text[1:]
    try:
        json_data = json.loads(text)
    except ValueError:
        # Malformed JSON: report the offending file name and skip it.
        print(json_file_name)
        return

    def scrapy_by_row(row):
        # Pull the two ids; rows missing either structure are reported & skipped.
        try:
            orgid = row['organization']['id']
            familyid = row['censusRegisterFamily']['id']
        except (KeyError, TypeError):
            print('errrr')
            return

    # Helper is defined once, outside the loop (original redefined it per row).
    for row in json_data['rows']:
        scrapy_by_row(row)
遍历文件夹
遍历目录(rootdir) 遍历到的每个文件都执行dirFunc
def waklThroughDir(rootdir, dirFunc):
    """Recursively walk *rootdir* and invoke dirFunc(path) on every .html file."""
    for parent, _dirnames, filenames in os.walk(rootdir):
        for fname in filenames:
            print(fname)
            # Dispatch only files whose last dot-separated suffix is 'html'.
            if fname.split('.')[-1] == 'html':
                dirFunc(os.path.join(parent, fname))
##采集温州房产网基本信息
# -*- coding: utf-8 -*-
import re
import requests
import time
#-----------------------------Regex constants used for parsing (pages are whitespace-stripped before matching)------------------
# Total number of matching records; the page count is derived from it.
PAGE_NUM = '共找到 (.*?) 符合条件的记录'
# Estate (xiaoqu) name — captured text still contains trailing anchor markup.
NAME = 'texttext_title"><ahref(.*?)</a></div><divclass="texttext_moreinfo">'
# Estate price.
PRICE = 'class="hot_price">(.*?)</span>'
# Estate address — also captured with markup to strip later.
ADDRESS = 'text_moreinfo">(.*?)</div><divclass="texttext_moreinfo"><span>'
# Output directory for all generated files.
ROOTDIR = 'F:\\test\\'
#-----------------------------Browser-like request headers; without them the site detects scripted requests and blocks them-----
HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
'Host': 'www.0577home.net',
'Upgrade-Insecure-Requests': '1'
}
#-----------------------------抓取某一页的房产信息,pageNo为页号--------------------------------------------------------
def getHouseListByPageno(pageNo):
    """Download listing page *pageNo* and save it under ROOTDIR.

    The raw HTML goes to houseList_pageNo<N>.txt so parsing can be re-run
    offline without refetching.
    """
    # Fresh session per request; HEADERS mimics a browser, verify=False
    # skips TLS verification.
    session = requests.session()
    url = 'http://www.0577home.net/xiaoqu/list_0_0_0_0_0_0_0_' + str(pageNo) + '.html'
    houseList = session.get(url, headers=HEADERS, verify=False)
    # with-block closes the file even if the write raises (original relied
    # on an explicit close that an exception would skip).
    with open(ROOTDIR + "houseList_pageNo" + str(pageNo) + ".txt", 'w', encoding='utf-8') as fh:
        fh.write(houseList.text)
#-------------------------------获取需要抓取的页面总数------------------------------------------------------------------
def getPageNum():
    """Return the total number of listing pages to scrape.

    Reads the already-downloaded first page and extracts the record count
    with the PAGE_NUM regex; the site shows 20 records per page.
    """
    # with-block closes the file (the original never closed it).
    with open(ROOTDIR + 'houseList_pageNo1.txt', encoding='utf-8') as f:
        rawContent = f.read()
    # First capture group is the record count as a decimal string.
    pageNum = re.findall(PAGE_NUM, rawContent)
    # Integer division plus one partial page; returns an int (the original
    # returned a float that every caller wrapped in int()).
    return int(pageNum[0]) // 20 + 1
def parseHouseListToFile(srcFile, dstFile):
    """Extract name/price/address from one saved listing page.

    Appends one '$'-separated 'name$price$address' line per estate to the
    already-open *dstFile* handle.
    """
    # with-block closes the source file (the original never closed it).
    with open(srcFile, encoding='utf-8') as f:
        rawContent = f.read()
    # Collapse ALL whitespace: the parsing regexes assume a space-free page.
    content = re.sub(r'\s+', '', rawContent)
    # Names: keep only the text after the last '>' (strips the anchor markup).
    names = [m[m.rfind('>') + 1:] for m in re.findall(NAME, content)]
    prices = re.findall(PRICE, content)
    # Addresses need the same markup-stripping as names.
    address = [m[m.rfind('>') + 1:] for m in re.findall(ADDRESS, content)]
    # NOTE(review): assumes the three lists line up one-to-one. The original
    # raised IndexError on a mismatch; zip() now truncates to the shortest.
    for name, price, addr in zip(names, prices, address):
        dstFile.write(name + '$' + price + '$' + addr + '\n')
# -------------------------------主函数,下载并解析房产信息--------------------------------------------------------------
if __name__ == '__main__':
    # ---------------------Download pages-----------------------------
    # Fetch the first listing page.
    getHouseListByPageno(1)
    # Use page 1 to learn how many pages exist in total.
    pageNum = getPageNum()
    # Fetch the remaining pages.
    for i in range(2, int(pageNum) + 1):
        getHouseListByPageno(str(i))
    # ---------------------Parse pages-----------------------------
    # Current date, used to prefix the output file names.
    localtime = time.strftime('%Y%m%d', time.localtime(time.time()))
    # Aggregate file, opened in append mode, one record line per estate.
    f = open(ROOTDIR + localtime + '_houseList.txt', 'a+', encoding='utf-8')
    # Parse every saved page.
    #for k in range(1, int(pageNum) + 1):
    # NOTE(review): the page count is hard-coded to 114 instead of using
    # pageNum as the commented-out line above intends — confirm intentional.
    for k in range(1, 115):
        parseHouseListToFile(ROOTDIR + "houseList_pageNo" + str(k) + ".txt", f)
    # Close the aggregate before re-reading it below.
    f.close()
    f = open(ROOTDIR + localtime + '_houseList.txt', encoding='utf-8')
    fd = open(ROOTDIR + localtime + '_houseInfo.txt', 'w', encoding='utf-8')
    k = 0
    for line in f:
        data = line.strip('\n')
        data = data.split('$')
        # NOTE(review): data[3] implies a 4th '$'-separated field per line,
        # but parseHouseListToFile writes only 3 — verify the line format.
        idx = data[3]
        # NOTE(review): getHouseInfoByPageno and parseHouseInfo are not
        # defined anywhere in this file — they must come from elsewhere.
        getHouseInfoByPageno(idx, k)
        houseInfo = parseHouseInfo(ROOTDIR + "houseInfo_pageNo" + str(idx) + ".html")
        print(str(k) + "$".join(data) + '$' + "$".join(houseInfo))
        fd.write("$".join(data) + '$' + "$".join(houseInfo) + '\n')
        k += 1
    f.close()
    fd.close()
调用java
# Call Java from Python via JPype: start a JVM, invoke a static decrypt method.
import sys
import jpype
# Value to decrypt comes from the command line.
name = sys.argv[1]
# Directory holding the jar that contains the DECRYPT class.
jarpath = '/home/dsadm/why/python'
# Start the default JVM with the jar directory on the extension path.
jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.ext.dirs=%s" % jarpath)
DECRYPT = jpype.JClass('why.fmrt.decrypt.DECRYPT')
upperName =DECRYPT.decrypt(name)
print(upperName)
# Shut the JVM down once the call is done.
jpype.shutdownJVM()
构建 web 页面
import os
import tornado.httpserver
import tornado.ioloop
import tornado.options
import tornado.web
from view import *
from tornado.options import define, options
define("port", default=8000, help="run on the given port", type=int)
class Application(tornado.web.Application):
    """Tornado application: routes '/' to Indexhandler, templates from ./templates."""

    def __init__(self):
        routes = [
            (r"/", Indexhandler),
        ]
        # autoescape=None disables template auto-escaping, matching the
        # original configuration; debug stays off.
        super().__init__(
            routes,
            template_path=os.path.join(os.path.dirname(__file__), 'templates'),
            autoescape=None,
            debug=False,
        )
if __name__ == "__main__":
tornado.options.parse_command_line()
http_server = tornado.httpserver.HTTPServer(Application(), xheaders=True)
http_server.listen(options.port)
tornado.ioloop.IOLoop.instance().start()
多进程
# Fan work out across processes, one Process per id in [0, PROCESS_NUM).
# NOTE(review): PROCESS_NUM, worker and the jobs list are defined elsewhere
# in the original project — not visible in this file.
import multiprocessing
for process_id in range(PROCESS_NUM):
    p = multiprocessing.Process(target=worker, args=(process_id,))
    # presumably kept so the parent can join() them later — confirm
    jobs.append(p)
    p.start()
多线程
# -*- coding: utf-8 -*-
import threading
class Threadconfig():
    """Build a fixed-size batch of daemon threads and run them to completion."""

    def __init__(self, thread_size):
        # Number of threads run() polls for liveness; must match the number
        # of tasks added via build().
        self.thread_size = thread_size

    def topen(self):
        """Reset the task list before building a new batch."""
        self.thread_tasks = []

    def build(self, func, **kwargs):
        """Create a thread that will run func(**kwargs) and queue it for run()."""
        self.thread_task = threading.Thread(target=func, kwargs=(kwargs))
        self.thread_tasks.append(self.thread_task)

    def run(self):
        """Start every queued thread, then block until all have finished."""
        for thread_task in self.thread_tasks:
            # .daemon attribute replaces setDaemon(), deprecated since 3.10;
            # daemon threads cannot block interpreter exit.
            thread_task.daemon = True
            thread_task.start()
        # Busy-poll until no tracked thread reports alive.
        while 1:
            alive = False
            for thread_num in range(0, self.thread_size):
                # is_alive() replaces isAlive(), which was removed in Python 3.9.
                alive = alive or self.thread_tasks[thread_num].is_alive()
            if not alive:
                break

    def __del__(self):
        # Drop thread references when the config object is collected.
        self.thread_tasks = []
移除Windows换行符(\r\n),把被CRLF分隔的多行内容合并
cmd = "sed ':a;N;$ s/\\r\\n//g;ba' " + oldfile + " > " + newfile
os.system(cmd)