302 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			302 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*****************************************************************************
 | |
|  *                                  _   _ ____  _
 | |
|  *  Project                     ___| | | |  _ \| |
 | |
|  *                             / __| | | | |_) | |
 | |
|  *                            | (__| |_| |  _ <| |___
 | |
|  *                             \___|\___/|_| \_\_____|
 | |
|  *
 | |
|  * $Id: htmltitle.cc,v 1.4 2008-05-22 21:20:09 danf Exp $
 | |
|  */
 | |
| 
 | |
| // Get a web page, parse it with libxml.
 | |
| //
 | |
| // Written by Lars Nilsson
 | |
| //
 | |
| // GNU C++ compile command line suggestion (edit paths accordingly):
 | |
| //
 | |
| // g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cc \
 | |
| // -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
 | |
| 
 | |
| #include <stdio.h>
 | |
| #include <string.h>
 | |
| #include <stdlib.h>
 | |
| #include <string>
 | |
| #include <curl/curl.h>
 | |
| #include <libxml/HTMLparser.h>
 | |
| 
 | |
| //
 | |
| //  Case-insensitive string comparison
 | |
| //
 | |
| 
 | |
// COMPARE(a, b) is true when the two C strings are equal ignoring case.
// On MSVC the comparison is spelled _stricmp; the un-prefixed stricmp used
// previously is only a deprecated alias there and triggers a deprecation
// diagnostic.  Every other toolchain is expected to provide strcasecmp.
#ifdef _MSC_VER
#define COMPARE(a, b) (!_stricmp((a), (b)))
#else
#define COMPARE(a, b) (!strcasecmp((a), (b)))
#endif
 | |
| 
 | |
| //
 | |
| //  libxml callback context structure
 | |
| //
 | |
| 
 | |
// State shared between the SAX callbacks while a document is parsed.
struct Context
{
  // True while character data should be appended to `title`.
  bool addTitle;
  // Text collected so far for the title.
  std::string title;

  // Starts with collection switched off and an empty title.
  Context()
    : addTitle(false),
      title()
  {
  }
};
 | |
| 
 | |
| //
 | |
| //  libcurl variables for error strings and returned data
 | |
| 
 | |
| static char errorBuffer[CURL_ERROR_SIZE];
 | |
| static std::string buffer;
 | |
| 
 | |
| //
 | |
| //  libcurl write callback function
 | |
| //
 | |
| 
 | |
// Appends the size * nmemb bytes found at `data` to *writerData and returns
// the number of bytes stored.
//
// The return type is size_t because libcurl's write-callback contract is
// declared that way; an int return would truncate large byte counts and
// would not match the function type libcurl calls through.  When no
// destination string was supplied nothing is stored and 0 is returned; a
// return value that differs from the number of bytes offered makes libcurl
// abort the transfer.
static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  if (writerData == NULL)
    return 0;

  const size_t byteCount = size * nmemb;

  writerData->append(data, byteCount);

  return byteCount;
}
 | |
| 
 | |
| //
 | |
| //  libcurl connection initialization
 | |
| //
 | |
| 
 | |
| static bool init(CURL *&conn, char *url)
 | |
| {
 | |
|   CURLcode code;
 | |
| 
 | |
|   conn = curl_easy_init();
 | |
| 
 | |
|   if (conn == NULL)
 | |
|   {
 | |
|     fprintf(stderr, "Failed to create CURL connection\n");
 | |
| 
 | |
|     exit(EXIT_FAILURE);
 | |
|   }
 | |
| 
 | |
|   code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
 | |
|   if (code != CURLE_OK)
 | |
|   {
 | |
|     fprintf(stderr, "Failed to set error buffer [%d]\n", code);
 | |
| 
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   code = curl_easy_setopt(conn, CURLOPT_URL, url);
 | |
|   if (code != CURLE_OK)
 | |
|   {
 | |
|     fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
 | |
| 
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
 | |
|   if (code != CURLE_OK)
 | |
|   {
 | |
|     fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
 | |
| 
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
 | |
|   if (code != CURLE_OK)
 | |
|   {
 | |
|     fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
 | |
| 
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
 | |
|   if (code != CURLE_OK)
 | |
|   {
 | |
|     fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
 | |
| 
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| //
 | |
| //  libxml start element callback function
 | |
| //
 | |
| 
 | |
| static void StartElement(void *voidContext,
 | |
|                          const xmlChar *name,
 | |
|                          const xmlChar **attributes)
 | |
| {
 | |
|   Context *context = (Context *)voidContext;
 | |
| 
 | |
|   if (COMPARE((char *)name, "TITLE"))
 | |
|   {
 | |
|     context->title = "";
 | |
|     context->addTitle = true;
 | |
|   }
 | |
| }
 | |
| 
 | |
| //
 | |
| //  libxml end element callback function
 | |
| //
 | |
| 
 | |
| static void EndElement(void *voidContext,
 | |
|                        const xmlChar *name)
 | |
| {
 | |
|   Context *context = (Context *)voidContext;
 | |
| 
 | |
|   if (COMPARE((char *)name, "TITLE"))
 | |
|     context->addTitle = false;
 | |
| }
 | |
| 
 | |
| //
 | |
| //  Text handling helper function
 | |
| //
 | |
| 
 | |
| static void handleCharacters(Context *context,
 | |
|                              const xmlChar *chars,
 | |
|                              int length)
 | |
| {
 | |
|   if (context->addTitle)
 | |
|     context->title.append((char *)chars, length);
 | |
| }
 | |
| 
 | |
| //
 | |
| //  libxml PCDATA callback function
 | |
| //
 | |
| 
 | |
| static void Characters(void *voidContext,
 | |
|                        const xmlChar *chars,
 | |
|                        int length)
 | |
| {
 | |
|   Context *context = (Context *)voidContext;
 | |
| 
 | |
|   handleCharacters(context, chars, length);
 | |
| }
 | |
| 
 | |
| //
 | |
| //  libxml CDATA callback function
 | |
| //
 | |
| 
 | |
| static void cdata(void *voidContext,
 | |
|                   const xmlChar *chars,
 | |
|                   int length)
 | |
| {
 | |
|   Context *context = (Context *)voidContext;
 | |
| 
 | |
|   handleCharacters(context, chars, length);
 | |
| }
 | |
| 
 | |
| //
 | |
| //  libxml SAX callback structure
 | |
| //
 | |
| 
 | |
| static htmlSAXHandler saxHandler =
 | |
| {
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   StartElement,
 | |
|   EndElement,
 | |
|   NULL,
 | |
|   Characters,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   NULL,
 | |
|   cdata,
 | |
|   NULL
 | |
| };
 | |
| 
 | |
| //
 | |
| //  Parse given (assumed to be) HTML text and return the title
 | |
| //
 | |
| 
 | |
| static void parseHtml(const std::string &html,
 | |
|                       std::string &title)
 | |
| {
 | |
|   htmlParserCtxtPtr ctxt;
 | |
|   Context context;
 | |
| 
 | |
|   ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
 | |
|                                   XML_CHAR_ENCODING_NONE);
 | |
| 
 | |
|   htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
 | |
|   htmlParseChunk(ctxt, "", 0, 1);
 | |
| 
 | |
|   htmlFreeParserCtxt(ctxt);
 | |
| 
 | |
|   title = context.title;
 | |
| }
 | |
| 
 | |
| int main(int argc, char *argv[])
 | |
| {
 | |
|   CURL *conn = NULL;
 | |
|   CURLcode code;
 | |
|   std::string title;
 | |
| 
 | |
|   // Ensure one argument is given
 | |
| 
 | |
|   if (argc != 2)
 | |
|   {
 | |
|     fprintf(stderr, "Usage: %s <url>\n", argv[0]);
 | |
| 
 | |
|     exit(EXIT_FAILURE);
 | |
|   }
 | |
| 
 | |
|   curl_global_init(CURL_GLOBAL_DEFAULT);
 | |
| 
 | |
|   // Initialize CURL connection
 | |
| 
 | |
|   if (!init(conn, argv[1]))
 | |
|   {
 | |
|     fprintf(stderr, "Connection initializion failed\n");
 | |
| 
 | |
|     exit(EXIT_FAILURE);
 | |
|   }
 | |
| 
 | |
|   // Retrieve content for the URL
 | |
| 
 | |
|   code = curl_easy_perform(conn);
 | |
|   curl_easy_cleanup(conn);
 | |
| 
 | |
|   if (code != CURLE_OK)
 | |
|   {
 | |
|     fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
 | |
| 
 | |
|     exit(EXIT_FAILURE);
 | |
|   }
 | |
| 
 | |
|   // Parse the (assumed) HTML code
 | |
| 
 | |
|   parseHtml(buffer, title);
 | |
| 
 | |
|   // Display the extracted title
 | |
| 
 | |
|   printf("Title: %s\n", title.c_str());
 | |
| 
 | |
|   return EXIT_SUCCESS;
 | |
| }
 |