Batch Downloading Landsat-8 Data with Python (II)
Continuing from Batch Downloading Landsat-8 Data with Python (I), this time the data comes from Google Storage.
For background, see the introduction to the Google Storage public datasets.
1. Get the image path/row numbers
Covered in Batch Downloading Landsat-8 Data with Python (I); not repeated here.
2. Get file information and download URLs from the path/row numbers
The Google Storage directory index is about 500 MB compressed and about 3 GB unpacked; scanning it directly for every query takes around 80 s, so to speed up retrieval I loaded it into a simple database.
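The index itself can be fetched and unpacked with a few lines. A minimal sketch, assuming the index is still published at gcp-public-data-landsat/index.csv.gz on storage.googleapis.com (the location used at the time of writing):

import gzip
import shutil
import urllib.request

# Assumed location of the Landsat scene index on Google Storage
INDEX_URL = 'https://storage.googleapis.com/gcp-public-data-landsat/index.csv.gz'

urllib.request.urlretrieve(INDEX_URL, 'index.csv.gz')  # ~500 MB download

# Decompress to the ~3 GB index.csv used below
with gzip.open('index.csv.gz', 'rb') as f_in, open('index.csv', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

With index.csv on disk, connect to MySQL and create the database: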
import mysql.connector

# Connect to the MySQL server
mydb = mysql.connector.connect(
    host="localhost",  # database host
    user="root",       # database user
    passwd="",         # database password
)
mycursor = mydb.cursor()
mycursor.execute("CREATE DATABASE landsat_index")  # create the database
Create the table:
mycursor.execute("CREATE TABLE gc_index( sid INT UNSIGNED AUTO_INCREMENT, \
SCENE_ID VARCHAR(30),\
PRODUCT_ID VARCHAR(50),\
SPACECRAFT_ID VARCHAR(10),\
SENSOR_ID VARCHAR(10),\
DATE_ACQUIRED VARCHAR(10),\
COLLECTION_NUMBER VARCHAR(10),\
COLLECTION_CATEGORY VARCHAR(10),\
SENSING_TIME VARCHAR(30),\
DATA_TYPE VARCHAR(10),\
WRS_PATH INT,\
WRS_ROW INT,\
CLOUD_COVER FLOAT,\
NORTH_LAT FLOAT,\
SOUTH_LAT FLOAT,\
WEST_LON FLOAT,\
EAST_LON FLOAT,\
TOTAL_SIZE INT,\
BASE_URL VARCHAR(150),\
PRIMARY KEY ( sid ))DEFAULT CHARSET=utf8")
# 查看表
mycursor.execute("SHOW TABLES")
for x in mycursor:
print(x)
# 查看描述
mycursor.execute("desc gc_index")
for x in mycursor:
print(x)
Insert the data:
sql = "INSERT INTO gc_index (SCENE_ID,PRODUCT_ID,SPACECRAFT_ID,SENSOR_ID,DATE_ACQUIRED,COLLECTION_NUMBER,COLLECTION_CATEGORY,SENSING_TIME,DATA_TYPE,WRS_PATH,WRS_ROW,CLOUD_COVER,NORTH_LAT,SOUTH_LAT,WEST_LON,EAST_LON,TOTAL_SIZE,BASE_URL)\
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
with open ('index.csv','r') as f:
for line in f:
line = line.replace('\n','')
val = tuple(line.split(','))
print(val)
mycursor.execute(sql, val)
mydb.commit()
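Executing the INSERT row by row over the whole index (millions of rows) is slow; batching helps. A sketch using cursor.executemany, reusing the sql statement and cursor from above; the 10000-row batch size is an arbitrary choice:

import itertools

with open('index.csv', 'r') as f:
    next(f)  # skip the header row
    while True:
        # Pull the next block of up to 10000 lines off the file iterator
        chunk = list(itertools.islice(f, 10000))
        if not chunk:
            break
        vals = [tuple(line.rstrip('\n').split(',')) for line in chunk]
        mycursor.executemany(sql, vals)
        mydb.commit()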
Query the database and save the download links.
Google Storage serves two styles of download URL; sample URLs:
https://storage.cloud.google.com/gcp-public-data-landsat/LC08/01/001/004/LC08_L1GT_001004_20130910_20170502_01_T2/LC08_L1GT_001004_20130910_20170502_01_T2_B1.TIF
http://storage.googleapis.com/gcp-public-data-landsat/LC08/01/001/004/LC08_L1GT_001004_20130910_20170502_01_T2/LC08_L1GT_001004_20130910_20170502_01_T2_B1.TIF
The googleapis.com form is accessible directly (no proxy needed), so the links are saved mainly in that style.
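The BASE_URL column in the index stores gs:// URIs; dropping the scheme and prefixing the googleapis.com host turns one into a plain HTTP link, which is exactly what the script below does. A quick illustration with a sample value from the index:

# BASE_URL as stored in the index
base = 'gs://gcp-public-data-landsat/LC08/01/001/004/LC08_L1GT_001004_20130910_20170502_01_T2'
# Strip 'gs://' and prepend the HTTP host
print('http://storage.googleapis.com/' + base.split('//')[-1])

The full retrieval script: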
import mysql.connector
import time
import os

CLOUD_MAX = 20           # maximum cloud cover (%)
PATH = 118               # WRS path
ROW = 39                 # WRS row
SPACECRAFT = 'LANDSAT_8'
INFO = 'SCENE_ID,PRODUCT_ID,CLOUD_COVER,TOTAL_SIZE,BASE_URL'  # columns to save: scene ID, product ID, cloud cover, size, link
BASE_URL = 'http://storage.googleapis.com/'

mydb = mysql.connector.connect(
    host="localhost",           # database host
    user="root",                # database user
    passwd="",                  # database password
    database='landsat_index',   # database to use
)
mycursor = mydb.cursor()  # create a cursor

# Create a folder named after the path/row, e.g. 118039
file_path = '{}{:0>3d}'.format(PATH, ROW)
base_path = os.getcwd()
entity_dir = os.path.join(base_path, file_path)
os.makedirs(entity_dir, exist_ok=True)
os.chdir(entity_dir)

print('Retrieving {}{:0>3d}'.format(PATH, ROW))
time_start = time.time()
sql = "SELECT {} FROM gc_index WHERE PRODUCT_ID != '' AND SPACECRAFT_ID = '{}' AND WRS_PATH = {} AND WRS_ROW = {} AND CLOUD_COVER <= {}".format(INFO, SPACECRAFT, PATH, ROW, CLOUD_MAX)
mycursor.execute(sql)
myresult = mycursor.fetchall()
time_end = time.time()
print('Time cost: ', time_end - time_start)

myresult.sort()  # tuples sort by their first element, i.e. SCENE_ID
print("Data Num: ", len(myresult))

# Save the retrieved scene information
file_result = 'gs{}{:0>3d}_result.csv'.format(PATH, ROW)
with open(file_result, 'w') as f:
    f.write(INFO + '\n')
    for result in myresult:
        f.write(','.join(str(i) for i in result) + '\n')

# Save the download links: ANG file, bands B1-B11, BQA and MTL for each scene
file_url = 'gs{}{:0>3d}_url.txt'.format(PATH, ROW)
suffixes = ['ANG.txt'] + ['B{}.TIF'.format(i) for i in range(1, 12)] + ['BQA.TIF', 'MTL.txt']
down_url = {}
for result in myresult:
    EntityID = result[1]
    url_middle = result[4].split('//')[-1]  # strip the gs:// scheme from BASE_URL
    down_url[EntityID] = ['{}{}/{}_{}'.format(BASE_URL, url_middle, EntityID, s) for s in suffixes]
with open(file_url, 'w') as f:
    f.write(str(down_url))  # read back with eval() in the download script
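A side note on persistence: writing the dict with str() and reading it back with eval() works because the file is self-generated, but JSON is the safer interchange format. A minimal variant, assuming the download script is switched to json.load accordingly:

import json

with open(file_url, 'w') as f:
    json.dump(down_url, f, indent=2)

# ...and in the download script:
# with open(file_path, 'r') as f:
#     file_list = json.load(f)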
3. Downloading
Every tool has its specialty: rather than downloading inside Python, the script hands the work to IDM (Internet Download Manager), calling it from Python to queue the tasks in batch.
import os
from subprocess import call
import wget  # only used by the commented-out fallback below

IDM = r'D:\Program Files\IDM\IDMan.exe'
file_path = './118039/gs118039_url.txt'
base_path = os.path.dirname(os.path.abspath(file_path))

with open(file_path, 'r') as f:
    file_list = eval(f.read())  # the {EntityID: [urls]} dict written by the retrieval script

for key in file_list.keys():
    # One sub-folder per scene
    entity_dir = os.path.join(base_path, key)
    os.makedirs(entity_dir, exist_ok=True)
    os.chdir(entity_dir)
    for url in file_list[key]:
        name = url.split('/')[-1]
        if os.path.exists(name):
            print('\nDownloaded: ', name)
            continue
        print('\nDownloading: ', name)
        try:
            # wget.download(url)  # pure-Python alternative
            # /d URL, /p save path, /f filename, /n silent mode, /a add to queue
            call([IDM, '/d', url, '/p', entity_dir, '/f', name, '/n', '/a'])
        except Exception:
            continue

# Start IDM's download queue; tasks added with /a otherwise just wait in it
call([IDM, '/s'])
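If you don't have IDM, a minimal pure-Python fallback using requests with streaming writes is sketched below; the 1 MB chunk size and 60 s timeout are arbitrary choices, and unlike IDM there is no multi-connection acceleration or resume support:

import requests

def download(url, name, chunk_size=1024 * 1024, timeout=60):
    """Stream a file from url to disk under the given name."""
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()  # fail loudly on HTTP errors
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)

Calling download(url, name) in place of the IDM line above keeps the rest of the loop unchanged.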
With IDM, Amazon S3 downloads run at roughly 500 KB/s, while Google Storage reaches about 1 MB/s (my lousy connection tops out at 1 MB/s; a paid Baidu Cloud membership gives the same speed).
Thanks, Google Storage. Besides Landsat, the Google Storage public datasets also include Sentinel-2, which I'll look into later. No telling when Baidu Cloud will offer public datasets of its own.