|
本帖最后由 3mile 于 2019-10-23 13:51 编辑
从tianmaotv获取,代码开源。各位自由发挥
首次使用python
设置了代码回复可见,只为统计下载量
论坛的 hide 功能有问题:代码框第 140 行被强行插入了【hide】,请各位自行删除这个【hide】。
修改成下图的样子
代码如下:
- import sys
- import requests
- import re
- import time
- import codecs
- import json
- import xml.etree.ElementTree as ET
- from datetime import datetime, date, time, timedelta
- from bs4 import BeautifulSoup
- from xml.dom import minidom
- from xpinyin import Pinyin
- if sys.version_info[0] == 2:
- from urllib2 import urlopen # Python 2
- else:
- from urllib.request import urlopen # Python3
def get_week_backup(now=None):
    """Return today's and tomorrow's weekday numbers as strings.

    Weekdays are numbered 1 (Monday) through 7 (Sunday), and the day after
    Sunday wraps back to 1.

    The earlier implementation used ``strftime('%w')``, which yields 0-6 with
    Sunday as 0, so its ``== 7`` wrap-around test could never be true and a
    Sunday produced ``['0', '1']``.

    Args:
        now: Optional ``datetime`` to use instead of the current local time.

    Returns:
        A two-item list ``[today, tomorrow]`` of weekday numbers as strings.
    """
    if now is None:
        now = datetime.now()
    today = now.isoweekday()
    tomorrow = 1 if today == 7 else today + 1
    return [str(today), str(tomorrow)]
def get_week(now=None):
    """Return the weekday numbers from today through Sunday, as strings.

    Weekdays are numbered 1 (Monday) through 7 (Sunday), so a Wednesday
    gives ``['3', '4', '5', '6', '7']`` and a Sunday gives ``['7']``.

    The earlier implementation used ``strftime('%w')``, where Sunday is 0;
    on Sundays it therefore returned ``'0'`` through ``'7'``.

    Args:
        now: Optional ``datetime`` to use instead of the current local time.

    Returns:
        A list of weekday numbers as strings, starting at today's.
    """
    if now is None:
        now = datetime.now()
    return [str(day) for day in range(now.isoweekday(), 8)]
def saveXML(root, filename, indent="\t", newl="\n", encoding="utf-8"):
    """Pretty-print an ElementTree element into *filename*.

    The element is serialised with ``ET.tostring``, re-parsed with minidom
    and written out through minidom's ``writexml`` so that the result is
    indented.

    Args:
        root: The ElementTree element to serialise.
        filename: Path of the file to write; it is opened for writing as UTF-8.
        indent: String added per nesting level.
        newl: String written at the end of each line.
        encoding: Encoding name placed in the XML declaration.
    """
    document = minidom.parseString(ET.tostring(root))
    with codecs.open(filename, 'w', 'utf-8') as out:
        document.writexml(
            out,
            indent="",
            addindent=indent,
            newl=newl,
            encoding=encoding,
        )
def ch_dict():
    """Scrape the channel index pages and build the channel lookup tables.

    Each index page is fetched, and every anchor inside the
    ``<dl class="tv-station-list">`` element whose ``href`` starts with
    ``/jmb`` is treated as a channel.

    Channel ids are the pinyin initials of the channel name. When two
    different names share the same initials, the later one gets a numeric
    suffix (``1``, ``2``, ...). Previously every clash was written to
    ``initials + "1"``, so a third clash overwrote the second and that
    channel ended up with a URL but no id.

    Returns:
        A tuple ``(dict_url, dict_name)`` where ``dict_url`` maps channel
        name to its relative href and ``dict_name`` maps channel name to its
        generated id.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0',
               'Connection': 'keep-alive', 'Cache-Control': 'no-cache'}
    url = "https://www.tianmaotv.cn/tv/"
    ch_list = ["type_101", "type_102", "tv_453", "tv_526"]
    dict_url = {}
    dict_name = {}
    id_owner = {}  # generated id -> channel name, used to detect clashes
    p = Pinyin()
    # Compiled once; the response object keeps its own name so the two are
    # never confused.
    jmb_href = re.compile(r"\/jmb")

    for ch in ch_list:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url + ch, headers=headers, timeout=30)
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'lxml')

        list_program_div = soup.find(
            name='dl', attrs={"class": "tv-station-list"}).find_all('a')

        for tagprogram in list_program_div:
            name = tagprogram.string
            href = tagprogram.get("href")
            # Skip anchors with no href, no plain-text label, or a link that
            # is not a channel page.
            if name is None or href is None or not jmb_href.match(href):
                continue
            dict_url[name] = href
            if name in dict_name:
                # Same channel listed on more than one index page: keep the
                # id it already has.
                continue
            initials = p.get_initials(name, u'')
            channel_id = initials
            suffix = 0
            while channel_id in id_owner:
                suffix += 1
                channel_id = initials + str(suffix)
            id_owner[channel_id] = name
            dict_name[name] = channel_id
    return dict_url, dict_name
def _parse_listing_time(text, today):
    """Parse a year-less ``MM-DD HH:MM`` string, picking the year nearest *today*."""
    candidates = []
    for year in (today.year - 1, today.year, today.year + 1):
        try:
            candidates.append(
                datetime.strptime("%d-%s" % (year, text), '%Y-%m-%d %H:%M'))
        except ValueError:
            # For example 02-29 in a year that is not a leap year.
            continue
    if not candidates:
        raise ValueError("unrecognised listing time: %r" % (text,))
    return min(candidates, key=lambda moment: abs(moment - today))


def get_channel_programme(url_name_py, real_rul):
    """Fetch one listing page and return its programmes keyed by start time.

    The page is expected to contain a ``<dl class="program-list">`` whose
    ``<dd>`` children each hold a ``<time>`` element with text in
    ``MM-DD HH:MM`` form, and the programme title as the third child node.

    The page does not carry a year. It used to be hard-coded to 2019; now
    the year is chosen so that the resulting date is the one closest to the
    current date, which also handles listings that cross a year boundary.

    Args:
        url_name_py: Channel id. Accepted for call compatibility; not used.
        real_rul: Path of the listing page, appended to the site root.

    Returns:
        A dict mapping start time (``YYYYmmddHHMMSS`` string) to the
        programme title.

    Raises:
        ValueError: If a listing time cannot be parsed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0',
               'Connection': 'keep-alive', 'Cache-Control': 'no-cache'}

    # A timeout keeps one stalled connection from hanging the whole run.
    r = requests.get("https://www.tianmaotv.cn" + real_rul,
                     headers=headers, timeout=30)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')

    today = datetime.now()
    timestart = {}
    list_program_div = soup.find(
        name='dl', attrs={"class": "program-list"}).find_all('dd')
    for i in list_program_div:
        times = _parse_listing_time(i.time.string.strip(), today)
        timestart[times.strftime("%Y%m%d%H%M%S")] = i.contents[2].strip()
    return timestart
# Script body: scrape the channel list, fetch the remaining days of this
# week's listings for every channel, and write everything to e.xml.
aa = ch_dict()
channel_url = aa[0]
channel_py = aa[1]

new_root = ET.Element('tv')
new_root.set("generator-info-name", "3mile")
new_root.set("generator-info-url", "https://3mile.top")
new_root.tail = "\n"

# One <channel> element per scraped channel; these are all emitted before
# any <programme> element.
for k, v in channel_py.items():
    child = ET.SubElement(new_root, "channel")
    child.set("id", v)
    child_name = ET.SubElement(child, "display-name")
    child_name.set("lang", "zh")
    child_name.text = k
    child_url = ET.SubElement(child, 'url')
    # NOTE(review): this host differs from the www.tianmaotv.cn host the
    # pages are fetched from - confirm which one is intended.
    child_url.text = "https://tianmao.tv" + channel_url[k]

wday = get_week()
for k, v in channel_url.items():
    programme = {}
    print("正在获取 "+k+" EPG信息")
    for i in wday:
        # NOTE(review): the pattern in the posted code was cut off (the
        # string literal was never closed). Removing one trailing slash
        # before appending "_w<day>/" appears to be the intent - confirm.
        url = re.sub(r'/$', "", v) + "_w" + i + "/"
        progr = get_channel_programme(channel_py[k], url)
        programme.update(progr)

    # Keys are YYYYmmddHHMMSS strings, so sorting them as text puts the
    # programmes in time order regardless of dict ordering.
    schedule = sorted(programme.items())
    for t, (begin, tit) in enumerate(schedule):
        if t + 1 < len(schedule):
            # A programme ends when the next one starts.
            end = schedule[t + 1][0]
        else:
            # Nothing follows the last entry; give it a one-hour slot.
            ta1 = datetime.strptime(begin, '%Y%m%d%H%M%S')
            ta1 = ta1 + timedelta(hours=1)
            end = ta1.strftime('%Y%m%d%H%M%S')
        programme_sub = ET.SubElement(new_root, "programme")
        programme_sub.set("start", begin + " +0800")
        programme_sub.set("stop", end + " +0800")
        programme_sub.set("channel", channel_py[k])

        programme_title = ET.SubElement(programme_sub, "title")
        programme_title.set("lang", "zh")
        programme_title.text = tit
    print("已经获取 "+k+"EPG内容")
saveXML(new_root, "e.xml")
复制代码
|
本帖子中包含更多资源
您需要 登录 才可以下载或查看,没有账号?立即注册
×
|