简单分析C之Curl模块同php的curl和python的pycurl模块的关系

缘起:以前一直喜欢用scrapy做爬虫,并且实践效果也很好,后来由于单位让自己写一套分布式爬虫(python实现),替代公司原有的爬虫(php实现),大致用于实践后,发现效果是比原来的效果好,原来能做配置的网站20个里能配置10个,现在20个里能配置16个,分析原因,是架构设计方面有那么一点点扩充性,在大致架构不变的基础上,可进行有限的扩展,而其实实现的原理都是通过CURL来实现的。

php的curl,是在php发布程序的ext文件中,作为一个php自带的支持,需要改写php的配置文件,修改php.ini,将;extension=php_curl.dll前的分号去掉。

python的pycurl,不是python自带的支持程序,python在做爬虫一般都是用urllib,urllib2,twisted等,比较少的使用pycurl.安装略.

c的curl,是前面2个语言的curl父程序,是c的curl才有了php的curl和python的pycurl,同时,python的pycurl文档说明了只实现了部分功能,即是一个c的curl的阉割版。泪奔,原来用了那么长时间的东西,连冰山一角都没触碰,或者python的pycurl也只是会用其中的一个或少数几个功能。

如何用:

C的curl:

复制代码
#include <stdio.h>
#include <curl/curl.h>
int main(void)
{
  CURL *curl;
  CURLcode res;
  curl = curl_easy_init();
  if(curl) {
    /* First set the URL that is about to receive our POST. This URL can
       just as well be a https:// URL if that is what should receive the
       data. */
    curl_easy_setopt(curl, CURLOPT_URL, "http://postit.example.com/moo.cgi");
    /* Now specify the POST data */
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, "name=daniel&project=curl");
    /* Perform the request, res will get the return code */
    res = curl_easy_perform(curl);
    /* always cleanup */
    curl_easy_cleanup(curl);
  }
  return 0;
}
复制代码

php的curl:

复制代码
<?php
$c = curl_init();
curl_setopt($c, CURLOPT_URL, 'http://www.baidu.com');
$data = curl_exec($c);
curl_close($c);
echo $c;
?>
复制代码

python的pycurl:

复制代码
import pycurl
def body(buffer):
    print buffer
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.baidu.com/")
c.setopt(pycurl.WRITEFUNCTION, body)
c.perform()
复制代码

主要原理:

C:

使用到的数据结构:

复制代码
typedef void CURL;  /*当初始化什么的时候只是一个void类型*/
struct SessionHandle {
  struct Names dns;
  struct Curl_multi *multi;    /* 用于多线程处理*/
  struct Curl_one_easy *multi_pos; /* if non-NULL, points to the its position
                                      in multi controlling structure to assist
                                      in removal. */
  struct Curl_share *share;    /* Share, handles global variable mutexing */
  struct HandleData reqdata;   /* Request-specific data */
  struct UserDefined set;      /* values set by the libcurl user ,用于setopt等*/
  struct DynamicStatic change; /* possibly modified userdefined data */
  struct CookieInfo *cookies;  /* the cookies, read from files and servers */
  struct Progress progress;    /* for all the progress meter data */
  struct UrlState state;       /* struct for fields used for state info and
                                  other dynamic purposes */
  struct PureInfo info;        /* stats, reports and info data */
#if defined(CURL_DOES_CONVERSIONS) && defined(HAVE_ICONV)
  iconv_t outbound_cd;         /* for translating to the network encoding */
  iconv_t inbound_cd;          /* for translating from the network encoding */
  iconv_t utf8_cd;             /* for translating to UTF8 */
#endif /* CURL_DOES_CONVERSIONS && HAVE_ICONV */
  unsigned int magic;          /* set to a CURLEASY_MAGIC_NUMBER */
};
struct UserDefined {
  FILE *err;         /* the stderr user data goes here */
  void *debugdata;   /* the data that will be passed to fdebug */
  char *errorbuffer; /* (Static) store failure messages in here */
  long proxyport; /* If non-zero, use this port number by default. If the
                     proxy string features a ":[port]" that one will override
                     this. */
 /**一下省略10000行- -**/
};
复制代码

使用的方法1:

复制代码
1.初始化curl,得到sessionhandler结构体空间
CURL *curl_easy_init(void)
{
  CURLcode res;
  struct SessionHandle *data;
  /* Make sure we inited the global SSL stuff */
  if (!initialized) {
    res = curl_global_init(CURL_GLOBAL_DEFAULT);
    if(res) {
      /* something in the global init failed, return nothing */
      DEBUGF(fprintf(stderr, "Error: curl_global_init failed\n"));
      return NULL;
    }
  }
  /* We use curl_open() with undefined URL so far */
  res = Curl_open(&data);
  if(res != CURLE_OK) {
    DEBUGF(fprintf(stderr, "Error: Curl_open failed\n"));
    return NULL;
  }
  return data;
}
复制代码

方法2.

复制代码
设置参数:
CURLcode curl_easy_setopt(CURL *curl, CURLoption tag, ...)
{
  va_list arg;
  struct SessionHandle *data = curl;
  CURLcode ret;
  if(!curl)
    return CURLE_BAD_FUNCTION_ARGUMENT;
  va_start(arg, tag);
  ret = Curl_setopt(data, tag, arg);
  va_end(arg);
  return ret;
}
CURLcode Curl_setopt(struct SessionHandle *data, CURLoption option,
                     va_list param)
{
  char *argptr;
  CURLcode result = CURLE_OK;
#ifndef CURL_DISABLE_HTTP
  curl_off_t bigsize;
#endif
  switch(option) {
  case CURLOPT_DNS_CACHE_TIMEOUT:
    data->set.dns_cache_timeout = va_arg(param, long);
    break;
  case CURLOPT_DNS_USE_GLOBAL_CACHE:
    {
      long use_cache = va_arg(param, long);
      if (use_cache)
        Curl_global_host_cache_init();
      data->set.global_dns_cache = (bool)(0 != use_cache);
    }
    break;
  case CURLOPT_SSL_CIPHER_LIST:
    /* set a list of cipher we want to use in the SSL connection */
    result = Curl_setstropt(&data->set.str[STRING_SSL_CIPHER_LIST],
                            va_arg(param, char *));
    break;
  case CURLOPT_RANDOM_FILE:
    /*
     * This is the path name to a file that contains random data to seed
     * the random SSL stuff with. The file is only used for reading.
     */
    result = Curl_setstropt(&data->set.str[STRING_SSL_RANDOM_FILE],
                            va_arg(param, char *));
    break;
  case CURLOPT_EGDSOCKET:
    /*
     * The Entropy Gathering Daemon socket pathname
     */
    result = Curl_setstropt(&data->set.str[STRING_SSL_EGDSOCKET],
                            va_arg(param, char *));
    break;
  case CURLOPT_MAXCONNECTS:
    /*
     * Set the absolute number of maximum simultaneous alive connection that
     * libcurl is allowed to have.
     */
    result = Curl_ch_connc(data, data->state.connc, va_arg(param, long));
    break;
  case CURLOPT_FORBID_REUSE:
    /*
     * When this transfer is done, it must not be left to be reused by a
     * subsequent transfer but shall be closed immediately.
     */
    data->set.reuse_forbid = (bool)(0 != va_arg(param, long));
    break;
  case CURLOPT_FRESH_CONNECT:
    /*
     * This transfer shall not use a previously cached connection but
     * should be made with a fresh new connect!
     */
    data->set.reuse_fresh = (bool)(0 != va_arg(param, long));
    break;
  case CURLOPT_VERBOSE:
    /*
     * Verbose means infof() calls that give a lot of information about
     * the connection and transfer procedures as well as internal choices.
     */
    data->set.verbose = (bool)(0 != va_arg(param, long));
    break;
  case CURLOPT_HEADER:
    /*
     * Set to include the header in the general data output stream.
     */
    data->set.include_header = (bool)(0 != va_arg(param, long));
    break;
  case CURLOPT_NOPROGRESS:
    /*
     * Shut off the internal supported progress meter
     */
    data->set.hide_progress = (bool)(0 != va_arg(param, long));
    if(data->set.hide_progress)
      data->progress.flags |= PGRS_HIDE;
    else
      data->progress.flags &= ~PGRS_HIDE;
    break;
  case CURLOPT_NOBODY:
    /*
     * Do not include the body part in the output data stream.
     */
    data->set.opt_no_body = (bool)(0 != va_arg(param, long));
    if(data->set.opt_no_body)
      /* in HTTP lingo, this means using the HEAD request */
      data->set.httpreq = HTTPREQ_HEAD;
    break;
  case CURLOPT_FAILONERROR:
    /*
     * Don't output the >=300 error code HTML-page, but instead only
     * return error.
     */
    data->set.http_fail_on_error = (bool)(0 != va_arg(param, long));
    break;
  case CURLOPT_UPLOAD:
  case CURLOPT_PUT:
    /*
     * We want to sent data to the remote host. If this is HTTP, that equals
     * using the PUT request.
     */
    data->set.upload = (bool)(0 != va_arg(param, long));
    if(data->set.upload)
      /* If this is HTTP, PUT is what's needed to "upload" */
      data->set.httpreq = HTTPREQ_PUT;
    break;
  case CURLOPT_FILETIME:
    /*
     * Try to get the file time of the remote document. The time will
     * later (possibly) become available using curl_easy_getinfo().
     */
    data->set.get_filetime = (bool)(0 != va_arg(param, long));
    break;
  case CURLOPT_FTP_CREATE_MISSING_DIRS:
    /*
     * An FTP option that modifies an upload to create missing directories on
     * the server.
     */
    data->set.ftp_create_missing_dirs = (bool)(0 != va_arg(param, long));
    break;
  case CURLOPT_FTP_RESPONSE_TIMEOUT:
    /*
     * An FTP option that specifies how quickly an FTP response must be
     * obtained before it is considered failure.
     */
    data->set.ftp_response_timeout = va_arg( param , long ) * 1000;
    break;
  case CURLOPT_DIRLISTONLY:
    /*
     * An option that changes the command to one that asks for a list
     * only, no file info details.
     */
    data->set.ftp_list_only = (bool)(0 != va_arg(param, long));
    break;
  case CURLOPT_APPEND:
    /*
     * We want to upload and append to an existing file.
     */
    data->set.ftp_append = (bool)(0 != va_arg(param, long));
    break;
  case CURLOPT_FTP_FILEMETHOD:
    /*
     * How do access files over FTP.
     */
    data->set.ftp_filemethod = (curl_ftpfile)va_arg(param, long);
    break;
  case CURLOPT_NETRC:
    /*
     * Parse the $HOME/.netrc file
     */
    data->set.use_netrc = (enum CURL_NETRC_OPTION)va_arg(param, long);
    break;
  case CURLOPT_NETRC_FILE:
    /*
     * Use this file instead of the $HOME/.netrc file
     */
    result = Curl_setstropt(&data->set.str[STRING_NETRC_FILE],
                            va_arg(param, char *));
    break;
  case CURLOPT_TRANSFERTEXT:
    /*
     * This option was previously named 'FTPASCII'. Renamed to work with
     * more protocols than merely FTP.
     *
     * Transfer using ASCII (instead of BINARY).
     */
    data->set.prefer_ascii = (bool)(0 != va_arg(param, long));
    break;
  case CURLOPT_TIMECONDITION:
    /*
     * Set HTTP time condition. This must be one of the defines in the
     * curl/curl.h header file.
     */
    data->set.timecondition = (curl_TimeCond)va_arg(param, long);
    break;
  case CURLOPT_TIMEVALUE:
    /*
     * This is the value to compare with the remote document with the
     * method set with CURLOPT_TIMECONDITION
     */
    data->set.timevalue = (time_t)va_arg(param, long);
    break;
  case CURLOPT_SSLVERSION:
    /*
     * Set explicit SSL version to try to connect with, as some SSL
     * implementations are lame.
     */
    data->set.ssl.version = va_arg(param, long);
    break;
#ifndef CURL_DISABLE_HTTP
  case CURLOPT_AUTOREFERER:
    /*
     * Switch on automatic referer that gets set if curl follows locations.
     */
    data->set.http_auto_referer = (bool)(0 != va_arg(param, long));
    break;
  case CURLOPT_ENCODING:
    /*
     * String to use at the value of Accept-Encoding header.
     *
     * If the encoding is set to "" we use an Accept-Encoding header that
     * encompasses all the encodings we support.
     * If the encoding is set to NULL we don't send an Accept-Encoding header
     * and ignore an received Content-Encoding header.
     *
     */
    argptr = va_arg(param, char *);
    result = Curl_setstropt(&data->set.str[STRING_ENCODING],
                            (argptr && !*argptr)?
                            (char *) ALL_CONTENT_ENCODINGS: argptr);
    break;
  case CURLOPT_FOLLOWLOCATION:
    /*
     * Follow Location: header hints on a HTTP-server.
     */
    data->set.http_follow_location = (bool)(0 != va_arg(param, long));
    break;
  case CURLOPT_UNRESTRICTED_AUTH:
    /*
     * Send authentication (user+password) when following locations, even when
     * hostname changed.
     */
    data->set.http_disable_hostname_check_before_authentication =
      (bool)(0 != va_arg(param, long));
    break;
  case CURLOPT_MAXREDIRS:
    /*
     * The maximum amount of hops you allow curl to follow Location:
     * headers. This should mostly be used to detect never-ending loops.
     */
    data->set.maxredirs = va_arg(param, long);
    break;
  case CURLOPT_POST301:
    /*
     * Obey RFC 2616/10.3.2 and resubmit a POST as a POST after a 301.
     */
    data->set.post301 = (bool)(0 != va_arg(param, long));
    break;
  case CURLOPT_POST:
    /* Does this option serve a purpose anymore? Yes it does, when
       CURLOPT_POSTFIELDS isn't used and the POST data is read off the
       callback! */
    if(va_arg(param, long)) {
      data->set.httpreq = HTTPREQ_POST;
      data->set.opt_no_body = FALSE; /* this is implied */
    }
    else
      data->set.httpreq = HTTPREQ_GET;
    break;
  case CURLOPT_COPYPOSTFIELDS:
    /*
     * A string with POST data. Makes curl HTTP POST. Even if it is NULL.
     * If needed, CURLOPT_POSTFIELDSIZE must have been set prior to
     *  CURLOPT_COPYPOSTFIELDS and not altered later.
     */
    argptr = va_arg(param, char *);
    if (!argptr || data->set.postfieldsize == -1)
      result = Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], argptr);
    else {
      /*
       *  Check that requested length does not overflow the size_t type.
       */
      if ((data->set.postfieldsize < 0) ||
          ((sizeof(curl_off_t) != sizeof(size_t)) &&
           (data->set.postfieldsize > (curl_off_t)((size_t)-1))))
        result = CURLE_OUT_OF_MEMORY;
      else {
        char * p;
        (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);
        /* Allocate even when size == 0. This satisfies the need of possible
           later address compare to detect the COPYPOSTFIELDS mode, and
           to mark that postfields is used rather than read function or
           form data.
         */
        p = malloc((size_t)(data->set.postfieldsize?data->set.postfieldsize:1));
        if (!p)
          result = CURLE_OUT_OF_MEMORY;
        else {
          if (data->set.postfieldsize)
            memcpy(p, argptr, data->set.postfieldsize);
          data->set.str[STRING_COPYPOSTFIELDS] = p;
        }
      }
    }
    data->set.postfields = data->set.str[STRING_COPYPOSTFIELDS];
    data->set.httpreq = HTTPREQ_POST;
    break;
  case CURLOPT_POSTFIELDS:
    /*
     * Like above, but use static data instead of copying it.
     */
    data->set.postfields = va_arg(param, void *);
    /* Release old copied data. */
    (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);
    data->set.httpreq = HTTPREQ_POST;
    break;
  case CURLOPT_POSTFIELDSIZE:
    /*
     * The size of the POSTFIELD data to prevent libcurl to do strlen() to
     * figure it out. Enables binary posts.
     */
    bigsize = va_arg(param, long);
    if (data->set.postfieldsize < bigsize &&
        data->set.postfields == data->set.str[STRING_COPYPOSTFIELDS]) {
      /* Previous CURLOPT_COPYPOSTFIELDS is no longer valid. */
      (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);
      data->set.postfields = NULL;
      }
    data->set.postfieldsize = bigsize;
    break;
  case CURLOPT_POSTFIELDSIZE_LARGE:
    /*
     * The size of the POSTFIELD data to prevent libcurl to do strlen() to
     * figure it out. Enables binary posts.
     */
    bigsize = va_arg(param, curl_off_t);
    if (data->set.postfieldsize < bigsize &&
        data->set.postfields == data->set.str[STRING_COPYPOSTFIELDS]) {
      /* Previous CURLOPT_COPYPOSTFIELDS is no longer valid. */
      (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);
      data->set.postfields = NULL;
      }
    data->set.postfieldsize = bigsize;
    break;
  case CURLOPT_HTTPPOST:
    /*
     * Set to make us do HTTP POST
     */
    data->set.httppost = va_arg(param, struct curl_httppost *);
    data->set.httpreq = HTTPREQ_POST_FORM;
    data->set.opt_no_body = FALSE; /* this is implied */
    break;
  case CURLOPT_REFERER:
    /*
     * String to set in the HTTP Referer: field.
     */
    if(data->change.referer_alloc) {
      free(data->change.referer);
      data->change.referer_alloc = FALSE;
    }
    result = Curl_setstropt(&data->set.str[STRING_SET_REFERER],
                            va_arg(param, char *));
    data->change.referer = data->set.str[STRING_SET_REFERER];
    break;
/**中间省略10000行case情况,但都是想data数据修正值*/
  default:
    /* unknown tag and its companion, just ignore: */
    result = CURLE_FAILED_INIT; /* correct this */
    break;
  }
  return result;
}
复制代码

3.真正发送请求:

复制代码
CURLcode curl_easy_perform(CURL *easy)
{
  CURLM *multi;
  CURLMcode mcode;
  CURLcode code = CURLE_OK;
  int still_running;
  struct timeval timeout;
  int rc;
  CURLMsg *msg;
  fd_set fdread;
  fd_set fdwrite;
  fd_set fdexcep;
  int maxfd;
  if(!easy)
    return CURLE_BAD_FUNCTION_ARGUMENT;
  multi = curl_multi_init();
  if(!multi)
    return CURLE_OUT_OF_MEMORY;
  mcode = curl_multi_add_handle(multi, easy);
  if(mcode) {
    curl_multi_cleanup(multi);
    if(mcode == CURLM_OUT_OF_MEMORY)
      return CURLE_OUT_OF_MEMORY;
    else
      return CURLE_FAILED_INIT;
  }
  /* we start some action by calling perform right away */
  do {
    while(CURLM_CALL_MULTI_PERFORM ==
          curl_multi_perform(multi, &still_running));
    if(!still_running)
      break;
    FD_ZERO(&fdread);
    FD_ZERO(&fdwrite);
    FD_ZERO(&fdexcep);
    /* timeout once per second */
    timeout.tv_sec = 1;
    timeout.tv_usec = 0;
    /* Old deprecated style: get file descriptors from the transfers */
    curl_multi_fdset(multi, &fdread, &fdwrite, &fdexcep, &maxfd);
    rc = Curl_select(maxfd+1, &fdread, &fdwrite, &fdexcep, &timeout);
    /* The way is to extract the sockets and wait for them without using
       select. This whole alternative version should probably rather use the
       curl_multi_socket() approach. */
    if(rc == -1)
      /* select error */
      break;
    /* timeout or data to send/receive => loop! */
  } while(still_running);
  msg = curl_multi_info_read(multi, &rc);
  if(msg)
    code = msg->data.result;
  mcode = curl_multi_remove_handle(multi, easy);
  /* what to do if it fails? */
  mcode = curl_multi_cleanup(multi);
  /* what to do if it fails? */
  return code;
}
复制代码

4.从内存去除申请的空间:

复制代码
void curl_easy_cleanup(CURL *curl)
{
  struct SessionHandle *data = (struct SessionHandle *)curl;
  if(!data)
    return;
  Curl_close(data);
}
复制代码

php:

1.使用的数据结构:

复制代码
typedef struct {
    struct _php_curl_error   err;
    struct _php_curl_free    *to_free;
    struct _php_curl_send_headers header;
    void ***thread_ctx;
    CURL                    *cp; /* php主要申请这个结构体,但这个结构体包含了C的CURL这个类型的结构体,所以可以采用ch->cp来设置这个结构体内容*/
    php_curl_handlers       *handlers;
    long                     id;
    unsigned int             uses;
    zend_bool                in_callback;
    zval                     *clone;
} php_curl;
复制代码

2. 使用的方法:

复制代码
PHP_FUNCTION(curl_init)
{
    php_curl    *ch;
    CURL        *cp;
    zval        *clone;
    char        *url = NULL;
    int        url_len = 0;
    char *cainfo;
    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|s", &url, &url_len) == FAILURE) {
        return;
    }
    cp = curl_easy_init();
    if (!cp) {
        php_error_docref(NULL TSRMLS_CC, E_WARNING, "Could not initialize a new cURL handle");
        RETURN_FALSE;
    }
    alloc_curl_handle(&ch);
    TSRMLS_SET_CTX(ch->thread_ctx);
    ch->cp = cp;
    ch->handlers->write->method = PHP_CURL_STDOUT;
    ch->handlers->write->type   = PHP_CURL_ASCII;
    ch->handlers->read->method  = PHP_CURL_DIRECT;
    ch->handlers->write_header->method = PHP_CURL_IGNORE;
    ch->uses = 0;
    MAKE_STD_ZVAL(clone);
    ch->clone = clone;
    curl_easy_setopt(ch->cp, CURLOPT_NOPROGRESS,        1);
    curl_easy_setopt(ch->cp, CURLOPT_VERBOSE,           0);
    curl_easy_setopt(ch->cp, CURLOPT_ERRORBUFFER,       ch->err.str);
    curl_easy_setopt(ch->cp, CURLOPT_WRITEFUNCTION,     curl_write);
    curl_easy_setopt(ch->cp, CURLOPT_FILE,              (void *) ch);
    curl_easy_setopt(ch->cp, CURLOPT_READFUNCTION,      curl_read);
    curl_easy_setopt(ch->cp, CURLOPT_INFILE,            (void *) ch);
    curl_easy_setopt(ch->cp, CURLOPT_HEADERFUNCTION,    curl_write_header);
    curl_easy_setopt(ch->cp, CURLOPT_WRITEHEADER,       (void *) ch);
    curl_easy_setopt(ch->cp, CURLOPT_DNS_USE_GLOBAL_CACHE, 1);
    curl_easy_setopt(ch->cp, CURLOPT_DNS_CACHE_TIMEOUT, 120);
    curl_easy_setopt(ch->cp, CURLOPT_MAXREDIRS, 20); /* prevent infinite redirects */
    cainfo = INI_STR("curl.cainfo");
    if (cainfo && strlen(cainfo) > 0) {
        curl_easy_setopt(ch->cp, CURLOPT_CAINFO, cainfo);
    }
#if defined(ZTS)    curl_easy_setopt(ch->cp, CURLOPT_NOSIGNAL, 1);
#endif
    if (url) {
        if (!php_curl_option_url(ch, url, url_len)) {
            _php_curl_close_ex(ch TSRMLS_CC);
            RETURN_FALSE;
        }
    }
    ZEND_REGISTER_RESOURCE(return_value, ch, le_curl);
    ch->id = Z_LVAL_P(return_value);
}
复制代码

执行真实下载

复制代码
PHP_FUNCTION(curl_exec)
{
    CURLcode    error;
    zval        *zid;
    php_curl    *ch;
    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zid) == FAILURE) {
        return;
    }
    ZEND_FETCH_RESOURCE(ch, php_curl *, &zid, -1, le_curl_name, le_curl);
    _php_curl_verify_handlers(ch, 1 TSRMLS_CC);
    _php_curl_cleanup_handle(ch);
    error = curl_easy_perform(ch->cp);
    SAVE_CURL_ERROR(ch, error);
    /* CURLE_PARTIAL_FILE is returned by HEAD requests */
    if (error != CURLE_OK && error != CURLE_PARTIAL_FILE) {
        if (ch->handlers->write->buf.len > 0) {
            smart_str_free(&ch->handlers->write->buf);
        }
        RETURN_FALSE;
    }
    if (ch->handlers->std_err) {
        php_stream  *stream;
        stream = (php_stream*)zend_fetch_resource(&ch->handlers->std_err TSRMLS_CC, -1, NULL, NULL, 2, php_file_le_stream(), php_file_le_pstream());
        if (stream) {
            php_stream_flush(stream);
        }
    }
    if (ch->handlers->write->method == PHP_CURL_RETURN && ch->handlers->write->buf.len > 0) {
        smart_str_0(&ch->handlers->write->buf);
        RETURN_STRINGL(ch->handlers->write->buf.c, ch->handlers->write->buf.len, 1);
    }
    /* flush the file handle, so any remaining data is synched to disk */
    if (ch->handlers->write->method == PHP_CURL_FILE && ch->handlers->write->fp) {
        fflush(ch->handlers->write->fp);
    }
    if (ch->handlers->write_header->method == PHP_CURL_FILE && ch->handlers->write_header->fp) {
        fflush(ch->handlers->write_header->fp);
    }
    if (ch->handlers->write->method == PHP_CURL_RETURN) {
        RETURN_EMPTY_STRING();
    } else {
        RETURN_TRUE;
    }
}
复制代码

关闭程序,清空内存

复制代码
PHP_FUNCTION(curl_close)
{
    zval        *zid;
    php_curl    *ch;
    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zid) == FAILURE) {
        return;
    }
    ZEND_FETCH_RESOURCE(ch, php_curl *, &zid, -1, le_curl_name, le_curl);
    if (ch->in_callback) {
        php_error_docref(NULL TSRMLS_CC, E_WARNING, "Attempt to close cURL handle from a callback");
        return;
    }
    if (ch->uses) {
        ch->uses--;
    } else {
        zend_list_delete(Z_LVAL_P(zid));
    }
}
复制代码

python的pycurl

1.使用的数据结构:

复制代码
typedef struct {
    PyObject_HEAD
    PyObject *dict;                 /* Python attributes dictionary */
    CURL *handle;                   /*引用C的curl的数据结构*/
    PyThreadState *state;
    CurlMultiObject *multi_stack;
    CurlShareObject *share;
    struct curl_httppost *httppost;
    struct curl_slist *httpheader;
    struct curl_slist *http200aliases;
    struct curl_slist *quote;
    struct curl_slist *postquote;
    struct curl_slist *prequote;
    /* callbacks */
    PyObject *w_cb;
    PyObject *h_cb;
    PyObject *r_cb;
    PyObject *pro_cb;
    PyObject *debug_cb;
    PyObject *ioctl_cb;
    PyObject *opensocket_cb;
    /* file objects */
    PyObject *readdata_fp;
    PyObject *writedata_fp;
    PyObject *writeheader_fp;
    /* misc */
    void *options[OPTIONS_SIZE];    /* for OBJECTPOINT options */
    char error[CURL_ERROR_SIZE+1];
} CurlObject;
复制代码

方法:

1.初始化对象:

复制代码
static CurlObject *
do_curl_new(PyObject *dummy)
{
    CurlObject *self = NULL;
    int res;
    char *s = NULL;
    UNUSED(dummy);
    /* Allocate python curl object */
    self = util_curl_new();
    if (self == NULL)
        return NULL;
    /* Initialize curl handle */
    self->handle = curl_easy_init();
    if (self->handle == NULL)
        goto error;
    /* Set curl error buffer and zero it */
    res = curl_easy_setopt(self->handle, CURLOPT_ERRORBUFFER, self->error);
    if (res != CURLE_OK)
        goto error;
    memset(self->error, 0, sizeof(self->error));
    /* Set backreference */
    res = curl_easy_setopt(self->handle, CURLOPT_PRIVATE, (char *) self);
    if (res != CURLE_OK)
        goto error;
    /* Enable NOPROGRESS by default, i.e. no progress output */
    res = curl_easy_setopt(self->handle, CURLOPT_NOPROGRESS, (long)1);
    if (res != CURLE_OK)
        goto error;
    /* Disable VERBOSE by default, i.e. no verbose output */
    res = curl_easy_setopt(self->handle, CURLOPT_VERBOSE, (long)0);
    if (res != CURLE_OK)
        goto error;
    /* Set FTP_ACCOUNT to NULL by default */
    res = curl_easy_setopt(self->handle, CURLOPT_FTP_ACCOUNT, NULL);
    if (res != CURLE_OK)
        goto error;
    /* Set default USERAGENT */
    s = (char *) malloc(7 + strlen(LIBCURL_VERSION) + 1);
    if (s == NULL)
        goto error;
    strcpy(s, "PycURL/"); strcpy(s+7, LIBCURL_VERSION);
    res = curl_easy_setopt(self->handle, CURLOPT_USERAGENT, (char *) s); /*主要在这里调用c的curl的curl_easy_setopt方法,生成一个CURLsessionhandler结构体*/
    if (res != CURLE_OK) {
        free(s);
        goto error;
    }
    self->options[ OPT_INDEX(CURLOPT_USERAGENT) ] = s; s = NULL;
    /* Success - return new object */
    return self;
error:
    Py_DECREF(self);    /* this also closes self->handle */
    PyErr_SetString(ErrorObject, "initializing curl failed");
    return NULL;
}
复制代码

2.设置参数

复制代码
do_curl_setopt(CurlObject *self, PyObject *args)
{
    int option;
    PyObject *obj;
    int res;
    if (!PyArg_ParseTuple(args, "iO:setopt", &option, &obj))
        return NULL;
    if (check_curl_state(self, 1 | 2, "setopt") != 0)
        return NULL;
    /* early checks of option value */
    if (option <= 0)
        goto error;
    if (option >= (int)CURLOPTTYPE_OFF_T + OPTIONS_SIZE)
        goto error;
    if (option % 10000 >= OPTIONS_SIZE)
        goto error;
#if 0 /* XXX - should we ??? */
    /* Handle the case of None */
    if (obj == Py_None) {
        return util_curl_unsetopt(self, option);
    }
#endif
    /* Handle the case of string arguments */
    if (PyString_Check(obj)) {
        char *str = NULL;
        Py_ssize_t len = -1;
        char *buf;
        int opt_index;
        /* Check that the option specified a string as well as the input */
        switch (option) {
        case CURLOPT_CAINFO:
/*此处省略10000行,为pycurl未实现的curl的功能*/
        case CURLOPT_CRLFILE:
        case CURLOPT_ISSUERCERT:
/* FIXME: check if more of these options allow binary data */
            str = PyString_AsString_NoNUL(obj);
            if (str == NULL)
                return NULL;
            break;
        case CURLOPT_POSTFIELDS:
            if (PyString_AsStringAndSize(obj, &str, &len) != 0)
                return NULL;
            /* automatically set POSTFIELDSIZE */
            if (len <= INT_MAX) {
                res = curl_easy_setopt(self->handle, CURLOPT_POSTFIELDSIZE, (long)len); /*可以看到pycurl的设置参数也就是使用的c的curl的curl_easy_setopt,即是对C的curl的一种封装*/
            } else {
                res = curl_easy_setopt(self->handle, CURLOPT_POSTFIELDSIZE_LARGE, (curl_off_t)len);
            }
            if (res != CURLE_OK) {
                CURLERROR_RETVAL();
            }
            break;
        default:
            PyErr_SetString(PyExc_TypeError, "strings are not supported for this option");
            return NULL;
        }
        /* Allocate memory to hold the string */
        assert(str != NULL);
        if (len <= 0)
            buf = strdup(str);
        else {
            buf = (char *) malloc(len);
            if (buf) memcpy(buf, str, len);
        }
        if (buf == NULL)
            return PyErr_NoMemory();
        /* Call setopt */
        res = curl_easy_setopt(self->handle, (CURLoption)option, buf);
        /* Check for errors */
        if (res != CURLE_OK) {
            free(buf);
            CURLERROR_RETVAL();
        }
        /* Save allocated option buffer */
        opt_index = OPT_INDEX(option);
        if (self->options[opt_index] != NULL) {
            free(self->options[opt_index]);
            self->options[opt_index] = NULL;
        }
        self->options[opt_index] = buf;
        Py_INCREF(Py_None);
        return Py_None;
    }
复制代码

3.关闭连接,或者说是删除内存中对象。

复制代码
static PyObject *
do_curl_close(CurlObject *self)
{
    if (check_curl_state(self, 2, "close") != 0) {
        return NULL;
    }
    util_curl_close(self); /*删除了CurlObject对象*/
    Py_INCREF(Py_None);
    return Py_None;
}
复制代码

由以上分析可以看出,php的curl和python的curl都是对curl的一种封装,如果想写出一个更符合自己需求的配置型爬虫,可以考虑直接用C写,不过C的爬虫是不适合快速开发,这由代码量决定。

当然更好的建议是使用webkit做爬虫,作为部分浏览器内核,毋庸置疑。以后再说.

声明:本站所有文章,如无特殊说明或标注,均为本站原创发布。任何个人或组织,在未征得本站同意时,禁止复制、盗用、采集、发布本站内容到任何网站、书籍等各类媒体平台。如若本站内容侵犯了原著者的合法权益,可联系我们进行处理。