#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Ensure the interpreter's default string encoding is UTF-8.
import sys
# Encoding that implicit str <-> unicode conversions should use.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    # NOTE(review): presumably ``reload(sys)`` is needed because
    # ``sys.setdefaultencoding`` is not available on the already-imported
    # module under Python 2 -- confirm against the target runtime.
    reload(sys)
    sys.setdefaultencoding(default_encoding)
import re
import urllib,urllib2,Cookie
from google.appengine.api import urlfetch
from google.appengine.ext import db
class Twitter(db.Model):
    """Datastore entity that stores a tweet id as a string."""

    # Tweet id kept as text; queries in this file sort on this property.
    id = db.StringProperty()
def make_cookie_header(cookie):
    """Build the value of a ``Cookie`` request header from ``cookie``.

    ``cookie`` is any mapping whose values expose ``key`` and ``value``
    attributes (for example a ``Cookie.SimpleCookie``).  Every entry is
    rendered as ``key=value; `` -- note that the result therefore ends with
    a trailing ``"; "`` -- and an empty mapping yields an empty string.
    """
    pairs = ["%s=%s; " % (morsel.key, morsel.value)
             for morsel in cookie.values()]
    return "".join(pairs)
def send_sina_msgs(username, password, msgs):
    '''Log in to Sina and publish every message in ``msgs``.

    username -- Sina account name.
    password -- Sina account password.
    msgs     -- a list of messages, not a single string; each item is
                posted separately, in list order.

    The cookies returned by the login request are replayed on every
    publish request.  Response status codes are not inspected, so a
    rejected login or post is not reported to the caller.
    '''
    # URL-encode the credentials: characters such as '&', '+', '@' or '%'
    # would otherwise corrupt the query string.
    login_query = urllib.urlencode([
        ("username", username),
        ("password", password),
        ("returntype", "TEXT"),
    ])
    # NOTE(review): the credentials travel in the query string of a GET
    # request (as in the original code); consider a POST if the endpoint
    # supports it.
    result = urlfetch.fetch(
        url="https://login.sina.com.cn/sso/login.php?" + login_query)
    cookie = Cookie.SimpleCookie(result.headers.get('set-cookie', ''))

    # The session cookie does not change between posts, so render it once.
    cookie_header = make_cookie_header(cookie)
    for msg in msgs:
        form_data = urllib.urlencode({"content": msg})
        urlfetch.fetch(url="http://t.sina.com.cn/mblog/publish.php",
                       payload=form_data,
                       method=urlfetch.POST,
                       headers={'Referer': 'http://t.sina.com.cn',
                                'Cookie': cookie_header})
def unescape(text):
    """Replace HTML or XML character references and entities in ``text``.

    Numeric references (``&#65;``, ``&#x41;``) and named entities
    (``&quot;``) are replaced by the character they denote.  The entities
    ``&amp;``, ``&gt;`` and ``&lt;`` are deliberately kept escaped, and
    anything that cannot be decoded (unknown name, malformed or
    out-of-range number) is left exactly as it appears in ``text``.

    Returns the converted string.

    Based on Fredrik Lundh's recipe:
    http://effbot.org/zone/re-sub.htm#unescape-html
    """
    import logging
    # The entity table lives in a different module on Python 2 and 3; the
    # original code used it without importing it at all.
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    kept_escaped = ("amp", "gt", "lt")

    def fixup(m):
        entity = m.group(0)
        if entity[:2] == "&#":
            # Numeric character reference, decimal or hexadecimal.
            try:
                if entity[:3].lower() == "&#x":
                    return to_char(int(entity[3:-1], 16))
                return to_char(int(entity[2:-1]))
            except (ValueError, OverflowError):
                logging.warning(
                    "unescape: invalid character reference %r", entity)
            return entity  # leave as is

        # Named entity.
        name = entity[1:-1]
        if name in kept_escaped:
            return entity
        try:
            return to_char(name2codepoint[name])
        except KeyError:
            logging.warning("unescape: unknown entity %r", entity)
        return entity  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
# Fetch one page of a user's Twitter timeline (20 messages at most), skipping tweets that start with '@'.
def parseTwitter(twitter_id, sina_id, sina_sn, since_id="",):
    """Forward one page of a Twitter timeline to Sina.

    twitter_id -- Twitter account whose ``user_timeline`` XML is fetched.
    sina_id    -- Sina user name handed to ``send_sina_msgs``.
    sina_sn    -- Sina password handed to ``send_sina_msgs``.
    since_id   -- when non-empty, added as the ``since_id`` query parameter
                  of the timeline request.

    Tweets whose text starts with ``@`` are skipped.  The remaining ones are
    unescaped and sent in reverse feed order; afterwards the id of the
    first matched tweet in the feed is stored in a new ``Twitter`` entity.
    Nothing is sent or stored when no tweet matches.  Failures are reported
    on standard output and are not raised to the caller.
    """
    import logging

    if since_id:
        url = "http://twitter.com/statuses/user_timeline/%s.xml?since_id=%s" % (twitter_id, since_id)
    else:
        url = "http://twitter.com/statuses/user_timeline/%s.xml" % (twitter_id)

    result = urlfetch.fetch(url)
    if result.status_code != 200:
        print("get twitter data error")
        return

    tweets = re.findall(
        r"(?i)<id>([^<]+)</id>\s*<text>(?!@)([^<]+)</text>", result.content)
    if not tweets:
        # Nothing to forward.  The original code fell through here, logged
        # in to Sina for nothing and then tried to store the builtin ``id``
        # function as the tweet id.
        return

    # Reverse the feed order before sending.
    messages = [unescape(tweet_text)
                for _tweet_id, tweet_text in reversed(tweets)]
    # The first match in the feed is the id recorded for the next run.
    newest_id = tweets[0][0]

    try:
        send_sina_msgs(sina_id, sina_sn, messages)
        marker = Twitter()
        marker.id = newest_id
        marker.put()
    except Exception:
        # Keep the best-effort behaviour, but record what actually failed.
        logging.exception("forwarding tweets to Sina failed")
        print("send sina messages error")
def getLatest():
    """Return the ``id`` of the first ``Twitter`` entity ordered by id, descending.

    Returns an empty string when no ``Twitter`` entity is stored.
    """
    # NOTE(review): if ``id`` is stored as a string the ordering is
    # lexicographic, so ids with different digit counts would not sort
    # numerically -- confirm against the model definition.
    query = db.GqlQuery("SELECT * FROM Twitter ORDER BY id DESC")
    # ``get()`` fetches the first result (or None) in a single query,
    # replacing the separate ``count()`` and index lookups.
    newest = query.get()
    if newest is None:
        return ""
    return newest.id
# Emit an empty line before doing any work.
# NOTE(review): presumably this terminates the CGI response headers -- confirm.
print("")

# Resume from the most recently recorded tweet id ("" on the first run).
latest = getLatest()

# modify here!!!  Account names and the Sina password are hard-coded below.
parseTwitter(
    twitter_id="haitai",
    sina_id="hity",
    sina_sn="******",
    since_id=latest,
)