#include "list.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>
#include <arpa/inet.h>
#include <sys/uio.h>
#include <unistd.h>
#include <strings.h>
#include <ctype.h>
#include <regex.h>

/* Debug tracing. Call as dprintf((fmt, args...)) -- note the doubled
 * parentheses: the whole printf argument list is passed as the single
 * macro argument `a`. With DEBUG defined the call expands to a printf
 * statement (the expansion already carries its own ';'); otherwise it
 * expands to nothing. */
#ifdef DEBUG
#define dprintf(a) printf a;
#else
#define dprintf(a)
#endif

/*********************************************************************
 * function:    char *my_malloc(size_t size)
 *
 * parameters:  size    number of bytes of memory to allocate
 *
 * description: Tries to allocate memory. If no memory is available
 *              at that moment it prints a warning to stderr and
 *              tries again at one second intervals until the
 *              allocation succeeds. A request for zero bytes is
 *              treated as a request for one byte, because malloc(0)
 *              is allowed to return NULL and would otherwise make
 *              the retry loop spin forever.
 *
 * return values: Returns a pointer to a block of memory of at least
 *                the number of bytes specified by the size
 *                parameter. Never returns NULL. The caller owns the
 *                block and releases it with free().
 *********************************************************************/

char *my_malloc(size_t size)
{
  char *block;

  /* malloc(0) may legitimately yield NULL; never wait on that. */
  if (size == 0)
    size = 1;

  while ((block = malloc(size)) == NULL) {
    fprintf(stderr, "Warning, not enough memory, trying again in a second\n");
    sleep(1);
  }
  return block;
}

/*********************************************************************
 * function:    int opensocket(char *hostname, int port)
 *
 * parameters:  *hostname    host name or dotted-decimal IPv4 address
 *              port         TCP port to connect to (1..65535)
 *
 * description: Resolves the host, creates a TCP socket and connects
 *              it to the given port of the given host. An error
 *              message is written to stderr on failure, and a socket
 *              that was created but could not be connected is
 *              closed again.
 *
 * return values: Returns 0 on error, the socket's descriptor
 *                otherwise. The caller closes the descriptor.
 *********************************************************************/

int opensocket(char *hostname, int port)
{
  int skt;
  struct sockaddr_in sktin;
  struct hostent *host;

  if (hostname == NULL) {
    fprintf(stderr,"Error - no host name given\n");
    return 0;
  }

  /* htons() would silently truncate anything that is not 16 bits wide */
  if (port <= 0 || port > 65535) {
    fprintf(stderr,"Error - can't connect to %s:%d\n",hostname,port);
    return 0;
  }

  memset(&sktin,0,sizeof(sktin));
  sktin.sin_family = AF_INET;

  /* set port number */
  sktin.sin_port = htons((unsigned short)port);

  /* map host name to ip address */
  host = gethostbyname(hostname);
  if (host != NULL && host->h_addrtype == AF_INET &&
      host->h_addr_list[0] != NULL &&
      host->h_length > 0 &&
      (size_t)host->h_length <= sizeof(sktin.sin_addr)) {
    memcpy(&sktin.sin_addr,host->h_addr_list[0],(size_t)host->h_length);
  } else {
    /* dotted decimal; s_addr is unsigned, so failure must be detected
       by comparing against INADDR_NONE, not by testing for < 0 */
    sktin.sin_addr.s_addr = inet_addr(hostname);
    if (sktin.sin_addr.s_addr == INADDR_NONE) {
      fprintf(stderr,"Error - can't get host entry for %s\n",hostname);
      return 0;
    }
  }

  /* allocate a socket */
  if ((skt = socket(PF_INET, SOCK_STREAM, 0))<0) {
    fprintf(stderr,"Error - can't create socket\n");
    return 0;
  }

  /* connect to the socket */
  if (connect(skt,(struct sockaddr *)&sktin,sizeof(sktin))<0) {
    fprintf(stderr,"Error - can't connect to %s:%d\n",hostname,port);
    close(skt);          /* do not leak the descriptor */
    return 0;
  }

  return(skt);
}

/*********************************************************************
 * function:    void geturl(char *str, char **hostname,
 *                          char **resource, int *httpport)
 *
 * parameters:  *str        string which contains the url
 *              **hostname  receives the name of the host
 *              **resource  receives the url without protocol and
 *                          hostname (always starts with '/')
 *              *httpport   receives the port number, if one is given
 *
 * description: Separates hostname, resource and port from the given
 *              string. A leading "scheme://" (or "//") is skipped;
 *              it is only recognised when the "//" is the first
 *              slash in the string, so a "//" inside the path of a
 *              scheme-less url is left alone. *httpport is written
 *              only when the url carries a ":port" that is a decimal
 *              number in 1..65535; otherwise the caller's value is
 *              kept. *hostname and *resource are allocated with
 *              my_malloc() and must be freed by the caller.
 *********************************************************************/

void geturl(char *str, char **hostname, char **resource, int *httpport)
{
  const char *authority = str;   /* start of "host[:port]" */
  const char *first_slash;
  const char *path;              /* first '/' after the authority, or end */
  const char *colon;
  char *endptr;
  size_t host_len;
  size_t path_len;
  long port;

  /* skip protocol */
  first_slash = strchr(str, '/');
  if (first_slash != NULL && first_slash[1] == '/')
    authority = first_slash + 2;

  /* parse resource path */
  path = strchr(authority, '/');
  if (path != NULL) {
    path_len = strlen(path);
    *resource = my_malloc(path_len + 1);
    memcpy(*resource, path, path_len + 1);
  } else {
    path = authority + strlen(authority);
    *resource = my_malloc(2);
    strcpy(*resource, "/");
  }

  /* get hostname and port */
  host_len = (size_t)(path - authority);
  colon = memchr(authority, ':', host_len);
  if (colon != NULL) {
    host_len = (size_t)(colon - authority);
    port = strtol(colon + 1, &endptr, 10);
    /* accept the port only if the digits run right up to the path */
    if (endptr != colon + 1 && endptr == path && port > 0 && port <= 65535)
      *httpport = (int)port;
  }

  *hostname = my_malloc(host_len + 1);
  memcpy(*hostname, authority, host_len);
  (*hostname)[host_len] = '\0';
}

/*********************************************************************
 * function:    char *get_url_base(const char *url)
 *
 * parameters:  *url         url
 *
 * description: Removes the last characters from the url, up to (but
 *              not including) the last '/' of its path. The "//"
 *              that follows a scheme is not counted as part of the
 *              path, so a url without any path (for example
 *              "http://host" or "host") yields the url itself with a
 *              '/' appended instead of scanning past the start of
 *              the string.
 *
 * return values: Returns the base part of the url, ending in '/',
 *                in memory obtained from my_malloc(); the caller
 *                frees it.
 *********************************************************************/

char* get_url_base(const char *url)
{
  const char *authority = url;
  const char *first_slash;
  const char *last_slash;
  size_t base_len;
  char *base;

  /* skip "scheme://" (or a leading "//") so it is not taken for a path */
  first_slash = strchr(url, '/');
  if (first_slash != NULL && first_slash[1] == '/')
    authority = first_slash + 2;

  last_slash = strrchr(authority, '/');
  if (last_slash != NULL) {
    /* keep everything up to and including the last path slash */
    base_len = (size_t)(last_slash - url) + 1;
    base = my_malloc(base_len + 1);
    memcpy(base, url, base_len);
  } else {
    /* no path at all: the base is the url plus a trailing slash */
    base_len = strlen(url) + 1;
    base = my_malloc(base_len + 1);
    memcpy(base, url, base_len - 1);
    base[base_len - 1] = '/';
  }
  base[base_len] = '\0';
  return base;
}

/*********************************************************************
 * function:    int getlink(char *str, int state, list *link_list,
 *                          char *url_base, char *hostname)
 *
 * parameters:  *str         null-terminated buffer of html code
 *              state        state of the state machine on entry
 *              *link_list   list where found links are stored
 *              *url_base    the base part of the current url
 *              *hostname    name of the current host
 *
 * description: Scans the buffer for href attributes of tags whose
 *              name starts with 'a' or "link", and for src
 *              attributes of tags starting with "frame". Every url
 *              found is made absolute and stored in the link list
 *              unless it is already there. The scan never reads
 *              past the terminating NUL of the buffer.
 *
 *              States:
 *                0  outside of any tag
 *                1  just after '<', looking at the tag name
 *                2  inside an a/link tag, skipping non-space text
 *                3  inside an a/link tag after whitespace, looking
 *                   for "href"
 *                4  after the attribute name, expecting '='
 *                5  after '=', expecting the value
 *                6  inside a double-quoted value
 *                7  value stored, skipping to the closing '>'
 *                8  uninteresting tag, skipping to the closing '>'
 *                9  at the start of an unquoted value
 *               10  inside a frame tag, skipping non-space text
 *               11  inside a frame tag after whitespace, looking
 *                   for "src"
 *
 * return values: Returns the state of the state machine after the
 *                buffer is processed, to be passed in with the next
 *                buffer.
 *********************************************************************/

/* Returns a NUL-terminated my_malloc() copy of the len bytes at start. */
static char *getlink_copy_span(const char *start, size_t len)
{
  char *copy = my_malloc(len + 1);

  memcpy(copy, start, len);
  copy[len] = '\0';
  return copy;
}

/* Turns url (heap-allocated, ownership passes to this function) into an
   absolute url and adds it to *link_list unless it is already present.
   A string that is not added is freed here instead of being leaked. */
static void getlink_store_url(list *link_list, char *url,
                              const char *url_base, const char *hostname)
{
  char *absolute;
  size_t len;

  if (strncasecmp(url, "http://", 7) == 0) {
    /* the url is complete already */
    absolute = url;
  } else if (url[0] == '/') {
    /* url is relative to the current server root */
    len = 7 + strlen(hostname) + strlen(url);
    absolute = my_malloc(len + 1);
    strcpy(absolute, "http://");
    strcat(absolute, hostname);
    strcat(absolute, url);
    free(url);
  } else {
    /* url is a local reference, add base part */
    len = strlen(url_base) + strlen(url);
    absolute = my_malloc(len + 1);
    strcpy(absolute, url_base);
    strcat(absolute, url);
    free(url);
  }

  /* Don't put duplicates in the list */
  if (!is_in_list(*link_list, absolute))
    *link_list = add_to_front(*link_list, absolute);
  else
    free(absolute);
}

int getlink(char *str,
            int state,
            list *link_list,
            char *url_base,
            char *hostname)
{
  int item;
  int i;
  int start;

  /* Use a state machine to extract links from html code */
  for (i = 0; str[i] != '\0'; i++) {
    /* <ctype.h> functions need a value in unsigned char range */
    item = (unsigned char)str[i];

    switch (state) {

    case 0:
      if (item == '<')
        state = 1;
      break;

    case 1:
      if (isspace(item))
        state = 1;
      else if (item == 'a' || item == 'A')
        state = 2;
      else if (strncasecmp(str+i, "link", 4) == 0) {
        i = i + 3;
        state = 2;
      }
      else if (strncasecmp(str+i, "frame", 5) == 0) {
        i = i + 4;
        state = 10;
      }
      else
        state = 8;
      break;

    case 2:
      if (isspace(item))
        state = 3;
      else if (item == '>')
        state = 0;
      else
        state = 2;
      break;

    case 3:
      if (strncasecmp(str+i, "href", 4) == 0) {
        i = i + 3;
        state = 4;
      }
      else if (isspace(item))
        state = 3;
      else if (item == '>')
        state = 0;
      else
        state = 2;
      break;

    case 4:
      if (isspace(item))
        state = 4;
      else if (item == '=')
        state = 5;
      else
        state = 0;
      break;

    case 5:
      if (isspace(item))
        state = 5;
      else if (item == '>')
        state = 0;
      else if (item == '"')
        state = 6;
      else {
        /* unquoted value: look at this character again in state 9 */
        i--;
        state = 9;
      }
      break;

    case 6:
      /* the url runs up to the closing quote, the end of the line or
         the end of the buffer, whichever comes first */
      start = i;
      while (str[i] != '\0' && str[i] != '\n' && str[i] != '"')
        i++;
      getlink_store_url(link_list,
                        getlink_copy_span(str + start, (size_t)(i - start)),
                        url_base, hostname);
      /* Only the closing quote is consumed. Stepping back otherwise
         keeps the loop from walking over the terminating NUL. */
      if (str[i] != '"')
        i--;
      state = 7;
      break;

    case 7:
      if (item == '>')
        state = 0;
      else
        state = 7;
      break;

    case 8:
      if (item == '>')
        state = 0;
      else
        state = 8;
      break;

    case 9:
      /* the url runs up to whitespace, '>' or the end of the buffer */
      start = i;
      while (str[i] != '\0' && !isspace((unsigned char)str[i]) &&
             str[i] != '>')
        i++;
      getlink_store_url(link_list,
                        getlink_copy_span(str + start, (size_t)(i - start)),
                        url_base, hostname);
      /* Step back so the delimiter is examined in state 7; otherwise
         a '>' directly after the value would be skipped and the tag
         would appear to stay open. */
      i--;
      state = 7;
      break;

    case 10:
      if (isspace(item))
        state = 11;
      else if (item == '>')
        state = 0;
      else
        state = 10;
      break;

    case 11:
      if (strncasecmp(str+i, "src", 3) == 0) {
        i = i + 2;
        state = 4;
      }
      else if (isspace(item))
        state = 11;
      else if (item == '>')
        state = 0;
      else
        state = 10;
      break;

    default:
      /* unknown state handed in by the caller: leave it untouched */
      break;
    }
  }

  return state;
}

/*********************************************************************
 * function:    int recvline(int fd, char **cptr, int *len,
 *                           int *maxlen)
 *
 * parameters:  fd           the stream socket handle
 *              *cptr        pointer to a heap-allocated char buffer;
 *                           may be replaced when the buffer grows
 *              *len         receives the number of characters stored
 *              *maxlen      current buffer size in bytes; updated
 *                           when the buffer grows
 *
 * description: Reads a newline-terminated string from a stream
 *              socket descriptor, one byte at a time. Carriage
 *              returns are dropped, the newline is kept, and the
 *              result is NUL-terminated. The buffer is doubled with
 *              realloc() whenever it is about to run full. A read
 *              interrupted by a signal (EINTR) is retried.
 *
 * return values: Returns 0 when a complete line was read. Returns 1
 *                on a read error, on allocation failure and on end
 *                of file; in the end-of-file case the buffer holds
 *                whatever was read before the stream ended.
 *********************************************************************/

int recvline(int fd, char **cptr, int *len, int *maxlen)
{
  int used = 0;       /* characters stored so far */
  int newsize;
  ssize_t got;
  char ch;
  char *grown;

  *len = 0;
  for (;;) {
    /* keep room for the next character plus the terminating NUL */
    if (used >= *maxlen - 1) {
      if (*maxlen > INT_MAX / 2) {
        fprintf(stderr, "Error - line too long\n");
        if (*maxlen > 0)
          (*cptr)[*maxlen - 1] = '\0';
        return(1);
      }
      newsize = (*maxlen > 0) ? *maxlen * 2 : 2;
      /* keep the old pointer until realloc has succeeded */
      grown = realloc(*cptr, (size_t)newsize);
      if (grown == NULL) {
        perror("realloc");
        if (*maxlen > 0)
          (*cptr)[*maxlen - 1] = '\0';
        return(1);
      }
      *cptr = grown;
      *maxlen = newsize;
    }

    got = read(fd, &ch, 1);

    if (got < 0) {
      if (errno == EINTR)
        continue;
      perror("read");
      (*cptr)[used] = '\0';
      return(1);
    }
    if (got == 0) {
      (*cptr)[used] = '\0';
      *len = used;
      return(1);     /* EOF */
    }
    if (ch == '\r')
      continue;      /* watch for cr-lf */

    (*cptr)[used++] = ch;
    if (ch == '\n')
      break;
  }
  (*cptr)[used] = '\0';
  *len = used;

  return(0);
}

/*********************************************************************
 * function:    int sendstr(int fd, char *cptr)
 *
 * parameters:  fd           the stream socket handle
 *              *cptr        pointer to a NUL-terminated char buffer
 *
 * description: Writes an ascii string to a stream socket descriptor.
 *              write() may accept fewer bytes than requested, so the
 *              call is repeated until the whole string has been
 *              written. A write interrupted by a signal (EINTR) is
 *              retried. The terminating NUL is not sent.
 *
 * return values: Returns 1 on error, 0 otherwise.
 *********************************************************************/

int sendstr(int fd, char *cptr)
{
  size_t left = strlen(cptr);   /* bytes still to be written */
  ssize_t sent;

  while (left > 0) {
    sent = write(fd, cptr, left);
    if (sent < 0) {
      if (errno == EINTR)
        continue;
      perror("write");
      return(1);
    }
    if (sent == 0) {
      /* no progress and no error code: give up rather than spin */
      fprintf(stderr, "write: no progress\n");
      return(1);
    }
    cptr += sent;
    left -= (size_t)sent;
  }
  return(0);
}

/* Header record stored as the first element of every per-url link
   list (grep_url adds it to the front of the list it builds). It ties
   the list to its url and remembers the depth at which that url was
   reached. */
struct link_list_header {
  int level;   /* depth (current_depth in grep_url) at which url was reached */
  char *url;   /* heap-allocated copy of the url; main() frees it */
}; 

/*********************************************************************
 * function:    list links_in_url(char *url, list url_list)
 *
 * parameters:  *url         url to look up
 *              url_list     list of already processed urls; the data
 *                           of each element is itself a link list
 *                           whose first element holds a
 *                           struct link_list_header
 *
 * description: Walks the list of processed urls and compares the
 *              url recorded in each link list's header with the
 *              given url. The element whose next pointer is NULL
 *              ends the walk and is not examined.
 *
 * return values: Returns the link list recorded for the url, or
 *                NULL if the url has not been processed yet.
 *********************************************************************/

list links_in_url(char *url, list url_list)
{
  list node;
  list links;
  struct link_list_header *hdr;

  for (node = url_list; node->next != NULL; node = node->next) {
    links = (list) node->data;
    hdr = (struct link_list_header *) links->data;
    if (strcmp(hdr->url, url) == 0)
      return links;
  }

  return NULL;
}

/*********************************************************************
 * function:    int grep_url(char *url, list *url_list,
 *                           int current_depth, int max_depth,
 *                           int readable, regex_t *preg)
 *
 * parameters:  *url           url to grep
 *              *url_list      list of all processed urls; each
 *                             element holds the link list of one url
 *              current_depth  the current depth in the url tree
 *              max_depth      maximum depth for recursion
 *              readable       flag for more readable output
 *              *preg          compiled regexp
 *
 * description: If the url has not been processed before, fetches it
 *              with an HTTP/1.0 GET request, prints every line of a
 *              text response that matches the regexp to stdout and
 *              collects the links (and any Location: redirect
 *              target) found in it. The link list, headed by a
 *              struct link_list_header, is then recorded in
 *              *url_list. Links are followed recursively until
 *              max_depth is reached. A url that was already
 *              processed is not fetched again; its links are only
 *              followed again when it is now reached by a shorter
 *              path than before.
 *
 * return values: Returns 0
 *********************************************************************/

int grep_url(char *url, list *url_list, int current_depth, int max_depth, int readable, regex_t *preg)
{
  char *url_base;
  char *buf;
  char *request;
  char *hostname = NULL;
  char *resource = NULL;
  char *tmpstr;
  const char *value;
  size_t value_len;
  int skt, len, headers_gone, first, is_text = 0;
  int httpport = 80;
  int parser_state = 0;
  int matches = 0;
  int bufsize = 512;
  regmatch_t pmatch;

  list link_list;
  struct link_list_header *header;

  link_list = links_in_url(url, *url_list);

  if (link_list == NULL) {
    url_base = get_url_base(url);

    link_list = create_list();

    geturl(url, &hostname, &resource, &httpport);
    dprintf(("connecting to %s:%d...", hostname, httpport));
    skt = opensocket(hostname, httpport);
    dprintf(("ok\n"));

    /* opensocket() reports failure with 0 (an int, not a pointer) */
    if (skt != 0) {
      dprintf(("fetching %s...\n", url));

      /* send message to server - note CRLF line termination; the
         empty line that ends the request is part of the format */
      request = my_malloc(strlen(resource)+30);
      sprintf(request,"GET %s HTTP/1.0\r\n\r\n",resource);
      /* a failed send shows up as EOF or an error in recvline below */
      (void)sendstr(skt,request);
      free(request);

      buf = my_malloc((size_t)bufsize);

      /* Read the response headers. headers_gone is tested BEFORE the
         next recvline() call, so that the first line of the body is
         not read and thrown away once the blank line has been seen. */
      headers_gone = 0;
      while (headers_gone == 0 && !recvline(skt, &buf, &len, &bufsize)) {
        /* Check if page contains text */
        if (strncasecmp(buf, "Content-Type: text", 18) == 0)
          is_text = 1;
        /* Check if page is redirected */
        else if (strncasecmp(buf, "Location:", 9) == 0) {
          /* the value follows optional blanks and ends at the newline */
          value = buf + 9;
          while (*value == ' ' || *value == '\t')
            value++;
          value_len = strcspn(value, "\r\n");
          if (value_len > 0) {
            tmpstr = my_malloc(value_len + 1);
            memcpy(tmpstr, value, value_len);
            tmpstr[value_len] = '\0';
            dprintf(("Following redirection to: %s\n", tmpstr));
            link_list = add_to_front(link_list, tmpstr);
          }
        } else if (buf[0] == '\n')
          headers_gone = 1;
      }

      /* read the body, print matching lines and collect links */
      if (is_text == 1) {
        while (!recvline(skt, &buf, &len, &bufsize)) {
          parser_state = getlink(buf, parser_state, &link_list, url_base, hostname);
          if (regexec(preg, buf, 1, &pmatch, 0) == 0) {
            if (readable == 1) {
              if (matches == 0) {
                if (current_depth != 0)
                  printf("\n");
                printf("%s:\n",url);
              }
              printf("\t%s", buf);
            } else
              printf("%s: %s", url, buf);
            matches++;
          }
        }
      } else {
        if (readable) {
          printf("%s: skipped (not text)\n",url);
        }
      }

      free(buf);

      close(skt);
    } else {
      if (readable) {
        printf("%s: couldn't connect \n",url);
      }
    }

    /* put the header in front of the links and record the list */
    header = (struct link_list_header *)my_malloc(sizeof(struct link_list_header));
    header->url = my_malloc(strlen(url)+1);
    strcpy(header->url, url);
    header->level = current_depth;

    link_list = add_to_front(link_list, header);
    *url_list = add_to_front(*url_list, link_list);
    first = 1;

    free(url_base);
    free(hostname);
    free(resource);
  } else {
    dprintf(("link loop detected, "));
    first = 0;
  }

  /* the first element is the header, the links follow it */
  header = (struct link_list_header *)link_list->data;
  link_list = link_list->next;

  if (header->level <= current_depth && !first) {
    dprintf(("not following to %s (shorter path already found)\n", url));
  } else {
    dprintf(("shortest path so far (length = %d) found to %s\n", current_depth, url));
    header->level = current_depth;

    if (current_depth < max_depth ) {
      while (link_list->next != NULL) {
        grep_url((char *)link_list->data, url_list, current_depth + 1, max_depth, readable, preg);
        link_list = link_list->next;
      }
    }
  }

  return 0;
}

/*********************************************************************
 * function:    void usage()
 *
 * description: Writes the command line synopsis of the httpgrep
 *              program to stderr and terminates the process with
 *              exit status 1. Does not return.
 *********************************************************************/

void usage()  {
  static const char synopsis[] =
    "Usage: httpgrep [-r] -l limit regexp URL\n"
    "-r   more readable output (indented etc)\n"
    "-l   maximum recursion depth\n";

  fputs(synopsis, stderr);
  exit(1);
}

/*********************************************************************
 * function:    int main(int argc, char **argv)
 *
 * parameters:  argc     Number of arguments given in command line
 *              **argv   arguments given in command line:
 *                       [-r] -l limit regexp URL
 *
 * description: The program opens a connection to the www server,
 *              asks for the html page in question using the http
 *              protocol and shows the lines of the page that match
 *              the given regular expression. The program will also
 *              look for links in the page and process similarly
 *              the pages that the links refer to, until the given
 *              maximum depth limits the search. The limit must be
 *              a complete number that fits in an int; anything else
 *              is reported and the usage text is printed.
 *
 * return values: Returns 0. Exits with status 1 on a usage error or
 *                an invalid regular expression.
 *********************************************************************/

int main (int argc, char **argv)
{
  int i;
  int error;
  char *regexp;
  char *url;
  char *endptr;
  char message[100];
  long depth;
  int max_depth = 0;
  int readable = 0;
  list url_list;
  list list_iterator, l;
  list next;
  regex_t preg;
  struct link_list_header *header;

  /* shortest valid form: httpgrep -l limit regexp URL */
  if (argc<5)
    usage();

  i = 1;
  if (strncmp(argv[i], "-r", 2) == 0) {
    readable = 1;
    i++;
  }

  if (strncmp(argv[i], "-l", 2) == 0) {
    /* argv[i+1] exists: i is at most 2 and argc is at least 5 */
    errno = 0;
    depth = strtol(argv[i+1], &endptr, 0);
    /* reject empty input, trailing garbage and out-of-range values */
    if (endptr == argv[i+1] || *endptr != '\0' || errno == ERANGE ||
        depth > INT_MAX || depth < INT_MIN) {
      fprintf(stderr, "Invalid number `%s'\n", argv[i+1]);
      usage();
    }
    max_depth = (int)depth;
    i = i+2;
  } else
    usage();

  /* exactly the regexp and the url must remain */
  if (argc-i != 2)
    usage();

  regexp = argv[i];
  i++;
  url = argv[i];

  if ((error = regcomp(&preg, regexp, REG_EXTENDED|REG_NOSUB)) != 0) {
    regerror(error, &preg, message, sizeof(message));
    fprintf(stderr, "Invalid regular expression: %s\n", message);
    exit(1);
  }

  url_list = create_list();

  grep_url(url, &url_list, 0, max_depth, readable, &preg);

  dprintf(("all done\n"));

  /* Release every per-url link list. Its first element is the header;
     the links follow. The inner walk only produces debug output. */
  list_iterator = url_list;

  while (list_iterator->next != NULL){
    l = list_iterator->data;
    header=l->data;
    dprintf(("url: %s, path-length: %d\n",header->url, header->level));
    l = l->next;
    while (l->next != NULL) {
      dprintf(("%s\n", (char*)l->data));
      l = l->next;
    }

    dprintf(("\n"));
    free(header->url);
    /* NOTE(review): presumably destroy_list() also frees each element's
       data (the header and the link strings) -- confirm in list.c */
    destroy_list(list_iterator->data);
    list_iterator = list_iterator->next;
  }

  /* the nodes of the outer list are released by hand */
  while (url_list != NULL) {
    next = url_list->next;
    free(url_list);
    url_list = next;
  }

  regfree(&preg);

  return 0;
}
      

    

/* End of file */











