• Home
  • About
    • Kira's Blog Site

      Doctor X's blog site

    • Learn More
    • Email
    • Twitter
    • Github
    • Steam
    • Weibo
  • Posts
    • All Posts
    • All Tags
  • Data Analysis
  • Climate Projects
  • Videos
  • Life
  • Notes
  • Algorithm
  • Papers

First to Crawl Funds Data

22 Feb 2020

  • 首先这个不是我写的,我只是在熟悉的过程中做个笔记,顺便写在自己的网站上
  • 当你了解BeautifulSoup或Scrapy的话,建议采用这两种,原因:
    1. 代码量比较少(很有可能)
    2. 爬取的框架比较完备,参数设置什么的都比较成熟
    3. 现有的框架你不要非要自己编,你是不是傻啊
  • 说归说,个人是想从底层了解的。咳咳咳……

借鉴

根据河里的肥鱼的文章进行学习。首先从天天基金开始出发,作者选取这个网站的原因是这个网站的基金比较全。这里随意选取银河创新成长混合(519674)作为例子。

需要的python包:

import requests
import time
import execjs
import matplotlib.pyplot as plt # 用于画图
import numpy as np

接口构造

def getUrl(fscode):
    """Return the URL of the per-fund JavaScript data file for ``fscode``.

    The ``v`` query parameter is set to the current local time formatted
    as ``YYYYmmddHHMMSS``.
    """
    stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
    return 'http://fund.eastmoney.com/pingzhongdata/' + fscode + '.js?v=' + stamp

获取数据

def getWorth(fscode):
    """Download one fund's data file and extract its net-worth histories.

    Args:
        fscode: Fund code as a string, e.g. ``'519674'``.

    Returns:
        A ``(netWorth, ACWorth)`` tuple of lists: the unit net-worth values
        and the accumulated net-worth values, each in the reverse of the
        order in which the data file lists them.

    Raises:
        requests.RequestException: If the request times out or the server
            answers with an HTTP error status.
    """
    # requests waits indefinitely by default; a timeout keeps one stalled
    # connection from hanging the caller forever.
    content = requests.get(getUrl(fscode), timeout=30)
    # Fail early on 4xx/5xx instead of feeding an error page to execjs.
    content.raise_for_status()

    # The response is a JavaScript file; execjs evaluates it so that the
    # variables it defines can be read back.
    jsContent = execjs.compile(content.text)
    name = jsContent.eval('fS_name')
    code = jsContent.eval('fS_code')
    # Unit net-worth trend
    netWorthTrend = jsContent.eval('Data_netWorthTrend')
    # Accumulated net-worth trend
    ACWorthTrend = jsContent.eval('Data_ACWorthTrend')

    # Keep only the values: unit entries are read via key 'y',
    # accumulated entries via index 1.
    netWorth = [dayWorth['y'] for dayWorth in netWorthTrend[::-1]]
    ACWorth = [dayACWorth[1] for dayACWorth in ACWorthTrend[::-1]]

    print(name, code)
    return netWorth, ACWorth

查看数据

# Fetch both net-worth series for fund 519674 and inspect the unit series.
netWorth, ACWorth = getWorth('519674')
print(netWorth)
# In an interactive session this bare expression echoes the number of values.
len(netWorth)

>>> 银河创新成长混合 519674
[6.1423, 5.9585, 5.747, 5.8071, 5.6002, 5.4243, 5.3753, 5.3354, 5.2235, 5.1426, 5.2218, 5.0837, 4.9414, 4.8895,
...
 0.9993, 1, 1]
2206

问题:

  • 我们如果分析最近几个周、几个月的数据,其实也可以不需要了解具体某一天的数据,取最近20天、40天等方式即可。当然,也可以从当天开始逆推回去,给每个净值标上日期,不过这个需要忽略节假日,处理起来比较麻烦且必要性不大,我就没有做这个处理。
  • 目前想到的解决办法是采用pip install chinesecalendar
>>> import datetime
>>> from chinese_calendar import is_workday
>>> da  =  datetime.date(2019,1,2)
>>> boll = is_workday(da)
>>> print(boll)
True
  • 然后对每天进行判断然后blahblah
    • 随便你是补全原来的数据
    • 还是剔除掉日期。

具体实现就看各位自己的喜好了。这里仅仅给出最简单的折线图:

import matplotlib.pyplot as plt
import numpy as np
plt.style.use('ggplot')

%matplotlib inline
fig, ax1 = plt.subplots()

x_ = 60
x = np.arange(1,x_ + 1,1)

V1 = netWorth[:x_][::-1]
ax1.plot(x, V1, linewidth=2, color='#986DB2', label='Net Worth Trend')
# ax1.fill_between(x, V1, min(V1), alpha=0.3)
ax1.set_xlim(x[0], x[-1])
ax1.set_ylim(np.min(V1), np.max(V1))
ax1.tick_params(axis='y', labelcolor='#986DB2')
ax2.grid(True)
leg = plt.legend(loc='upper left')

ax2 = ax1.twinx() 
V2 = (np.array(netWorth[:-1]) - np.array(netWorth[1:]))[:x_][::-1]
ax2.plot(x, V2, linewidth=1, color='#7DB9DE', label='incremental')
ax2.fill_between(x=x, y1=0, y2=V2, color='#7DB9DE', alpha=0.5)
# ax1.xlim(x[0], x[-1])
ax2.set_ylim(np.min(V2), np.max(V2))
ax2.tick_params(axis='y', labelcolor='#7DB9DE')

leg = plt.legend(loc='upper right')
ax2.grid(False)
plt.tight_layout()


plt.show()

获取所有数据并保存

import requests
import time
import execjs

def getUrl(fscode):
    """Build the address of the JavaScript data file for fund ``fscode``.

    The current local time (``YYYYmmddHHMMSS``) is appended as the ``v``
    query parameter.
    """
    version = time.strftime("%Y%m%d%H%M%S", time.localtime())
    base = 'http://fund.eastmoney.com/pingzhongdata/' + fscode
    return base + '.js?v=' + version

# 根据基金代码获取净值
def getWorth(fscode):
    """Fetch the net-worth histories of the fund identified by ``fscode``.

    Args:
        fscode: Fund code as a string.

    Returns:
        A ``(netWorth, ACWorth)`` tuple of lists holding the unit and the
        accumulated net-worth values, each reversed relative to the order
        in the downloaded data file.

    Raises:
        requests.RequestException: If the request times out or the server
            answers with an HTTP error status.
    """
    # Without a timeout requests can block forever on a stalled connection,
    # which would freeze a crawl over many fund codes.
    content = requests.get(getUrl(fscode), timeout=30)
    # Surface HTTP errors directly rather than as a confusing JS failure.
    content.raise_for_status()
    jsContent = execjs.compile(content.text)

    name = jsContent.eval('fS_name')
    code = jsContent.eval('fS_code')
    # Unit net-worth trend
    netWorthTrend = jsContent.eval('Data_netWorthTrend')
    # Accumulated net-worth trend
    ACWorthTrend = jsContent.eval('Data_ACWorthTrend')

    # Unit entries expose the value under key 'y'; accumulated entries
    # expose it at index 1.
    netWorth = [dayWorth['y'] for dayWorth in netWorthTrend[::-1]]
    ACWorth = [dayACWorth[1] for dayACWorth in ACWorthTrend[::-1]]

    print(name, code)
    return netWorth, ACWorth
  
def getAllCode():
    """Return the list of all fund codes published by the search index.

    Downloads the ``fundcode_search.js`` file, evaluates its ``r`` variable
    and collects the first element of every entry.

    Returns:
        A list with one code per entry of ``r``.

    Raises:
        requests.RequestException: If the request times out or the server
            answers with an HTTP error status.
    """
    url = 'http://fund.eastmoney.com/js/fundcode_search.js'
    # requests has no default timeout; bound the wait explicitly.
    content = requests.get(url, timeout=30)
    content.raise_for_status()
    jsContent = execjs.compile(content.text)
    rawData = jsContent.eval('r')
    # Each entry is a sequence whose first element is the fund code.
    return [entry[0] for entry in rawData]

allCode = getAllCode()

# Write one CSV row per fund: the quoted fund code followed by its values.
# The with-statement closes both files even if the loop is interrupted.
with open('./netWorth.csv', 'w') as netWorthFile, \
        open('./ACWorth.csv', 'w') as ACWorthFile:
    for code in allCode:
        try:
            netWorth, ACWorth = getWorth(code)
        except Exception as err:
            # Best effort: skip funds whose data cannot be fetched or parsed,
            # but report why. Catching Exception (not a bare except) lets
            # Ctrl-C still stop the crawl.
            print(code + " skipped: " + str(err))
            continue
        # Skip funds for which either series is empty.
        if not netWorth or not ACWorth:
            print(code+"'s' data is empty.")
            continue

        rowPrefix = "\'" + code + "\',"
        netWorthFile.write(rowPrefix + ",".join(map(str, netWorth)) + "\n")
        ACWorthFile.write(rowPrefix + ",".join(map(str, ACWorth)) + "\n")
        print("write "+code+"'s data success.")



Like Tweet +1