Monday, May 20, 2013

Connecting to Gmail and downloading all of your mail in .mbox file format to a local hard drive using Python is easy. Try it.



Note that this script marks all mail in your mailboxes as read, so be careful. Also, I am not responsible for any data loss or corruption. Save this under some filename with a .py extension and run it from the command line as filename.py. It is written for Python 3.x.

 

import getpass
import imaplib,re,mailbox
from email.parser import BytesParser
class mbox:
    """Local mbox file that downloaded messages are appended to.

    The file is opened (and created if missing) by ``mailbox.mbox`` using a
    relative name, so it ends up in the current working directory.
    Instances can also be used as context managers; leaving the ``with``
    block closes the file.
    """

    def __init__(self, filename):
        """Open the mbox file that corresponds to an IMAP folder path.

        filename -- folder path such as ``[Gmail]/Drafts``. Only the part
                    after the last ``/`` is used, with ``.mbox`` appended.
        """
        filename = filename.split('/')[-1] + ".mbox"
        self.file_handle = mailbox.mbox(filename)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def dump_mails(self, msg_list):
        """Append every message in ``msg_list`` to the file and flush it.

        msg_list -- iterable of objects accepted by ``mailbox.mboxMessage``
                    (for example parsed ``email.message.Message`` objects).

        The file is locked for the duration of the write. The lock is taken
        before the ``try`` so that ``unlock()`` only runs when the lock was
        actually acquired.
        """
        self.file_handle.lock()
        try:
            for each_msg in msg_list:
                self.file_handle.add(mailbox.mboxMessage(each_msg))
            self.file_handle.flush()
        finally:
            self.file_handle.unlock()

    def close(self):
        """Close the underlying mbox file."""
        self.file_handle.close()



class gmail:
    """Small convenience wrapper around an ``imaplib.IMAP4_SSL`` session.

    Attributes set in ``__init__``:
      IMAP_SERVER, IMAP_PORT -- endpoint used by connect().
      M         -- the imaplib connection object, or None before connect().
      response  -- data part of the most recent server reply stored by a method.
      mailboxes -- folder names collected by the last get_mailboxes() call.
    """

    # One line of a LIST reply: (flags) "delimiter" name
    _LIST_LINE = re.compile(
        r'\((?P<flags>[^)]*)\) (?P<delim>"[^"]*"|NIL) (?P<name>.*)'
    )

    def __init__(self):
        self.IMAP_SERVER = 'imap.gmail.com'
        self.IMAP_PORT = 993
        self.M = None
        self.response = None
        self.mailboxes = []

    def connect(self, username, password):
        """Open an SSL connection and log in; return the login status string.

        The server's reply data is kept in ``self.response``. Any exception
        raised by imaplib during connect/login propagates to the caller.
        """
        self.M = imaplib.IMAP4_SSL(self.IMAP_SERVER, self.IMAP_PORT)
        status, self.response = self.M.login(username, password)
        return status

    def get_mailboxes(self):
        """Return the list of folder names reported by the server's LIST.

        The list is rebuilt on every call, so calling this twice does not
        produce duplicate entries. Surrounding double quotes are removed from
        each name; names the server sends unquoted are returned as-is.
        """
        rc, self.response = self.M.list()
        self.mailboxes = []

        for item in self.response:
            # Skip anything that is not a plain reply line (e.g. None).
            if not isinstance(item, bytes):
                continue
            match = self._LIST_LINE.match(item.decode("ascii"))
            if match is None:
                continue
            name = match.group("name").strip()
            if len(name) >= 2 and name[0] == '"' and name[-1] == '"':
                name = name[1:-1]
            self.mailboxes.append(name)
        return self.mailboxes

    def get_mailcount(self, mailbox='inbox'):
        """Select ``mailbox`` and return its message count as an int.

        The count is also stored in ``self.mailcount``.
        """
        rc, self.response = self.M.select(mailbox)
        self.mailcount = int(self.response[0])
        return self.mailcount

    def get_unread_mailcount(self, mailbox='inbox'):
        """Return the UNSEEN count of ``mailbox`` as a string of digits.

        Each item of the STATUS reply is also printed, as before.
        """
        rc, message = self.M.status(mailbox, "(UNSEEN)")
        for item in message:
            print("message item= ", item)
        first = message[0]
        text = first.decode("ascii", "replace") if isinstance(first, bytes) else str(first)
        unreadCount = re.search(r"UNSEEN (\d+)", text).group(1)
        return unreadCount

    def rename_mailbox(self, oldmailbox, newmailbox):
        """Rename a folder; return the server's status string."""
        rc, self.response = self.M.rename(oldmailbox, newmailbox)
        return rc

    def create_mailbox(self, mailbox):
        """Create a folder; return the server's status string."""
        rc, self.response = self.M.create(mailbox)
        return rc

    def delete_mailbox(self, mailbox):
        """Delete a folder; return the server's status string."""
        rc, self.response = self.M.delete(mailbox)
        return rc

    def get_all_mails(self, mailfolder, readonly=False):
        """Fetch every message in ``mailfolder`` and return them parsed.

        mailfolder -- name of the folder to select on the server.
        readonly   -- passed to imaplib's ``select``. With the default
                      (False) the folder is opened read-write, exactly as
                      before, and fetching probably marks the messages as
                      read, so be careful. Pass True to open the folder
                      read-only instead.

        Returns a list of message objects produced by ``BytesParser``.
        Raises ``imaplib.IMAP4.error`` if the folder cannot be selected.
        """
        # select the requested folder on the server
        rc, self.response = self.M.select(mailbox=mailfolder, readonly=readonly)
        if rc != 'OK':
            raise imaplib.IMAP4.error(
                "could not select %r: %r" % (mailfolder, self.response)
            )

        # get the list of message numbers
        status, data = self.M.search(None, 'ALL')
        mail_numbers = (data[0] or b'').split() if data else []

        parser = BytesParser()
        mail_obj_list = []
        for mail_no in mail_numbers:
            status, mail = self.M.fetch(mail_no, '(RFC822)')
            # The message bytes are the second element of a tuple part;
            # other parts of the reply are not tuples.
            raw = next(
                (part[1] for part in (mail or []) if isinstance(part, tuple)),
                None,
            )
            if raw is None:
                # nothing came back for this number; skip it
                continue
            mail_obj_list.append(parser.parsebytes(raw))
        return mail_obj_list

    def disconnect(self):
        """Log out of the server; does nothing if connect() was never called."""
        if self.M is not None:
            self.M.logout()


#main script starts


def main():
    """Log in to Gmail and dump every folder into its own local .mbox file.

    Prompts for the username and password, lists the folders on the server,
    and for each folder downloads all messages and appends them to a file
    named after the folder. Folders whose name contains a space are skipped.
    A failure in one folder is reported and the remaining folders are still
    processed; the IMAP session is always logged out at the end.
    """
    gmail_link = gmail()
    username = input('Enter username:').strip()
    password = getpass.getpass()
    print("connecting...")
    print(gmail_link.connect(username, password), gmail_link.response)

    try:
        print("Retrieving mailboxes...")
        folders = gmail_link.get_mailboxes()
        print("Done...")

        # The loop variable must not be called ``mailbox``: that would rebind
        # the imported ``mailbox`` module that the mbox class relies on.
        for folder in folders:
            if ' ' in folder:
                continue
            print("Processing mailbox:", folder)

            mailbox_handle = None
            try:
                # get all mails present in this folder
                mails = gmail_link.get_all_mails(mailfolder=folder)

                # create a new mbox file for this folder
                mailbox_handle = mbox(filename=folder)

                # add the mails to the mbox file
                mailbox_handle.dump_mails(mails)
                print("Processing mailbox:", folder, "done")
            except Exception as exc:
                # Best effort: report the reason and move on to the next one.
                print("Could not retrieve from mailbox:", folder, "-", exc)
            finally:
                # Only close a handle that was opened for this folder.
                if mailbox_handle is not None:
                    mailbox_handle.close()
    finally:
        print("Disconnecting...")
        gmail_link.disconnect()


if __name__ == "__main__":
    main()