Encyclopedia of Life image scraper

For the OneZoom project, I’ve been writing some perl code to grab appropriate images from the Encyclopedia of Life. Here is the current code, warts and all. To make it look nice, you’ll need to stick it in a sensible text editor.

#!/usr/bin/perl -ws
# Given an input file of species names, prints the same number of lines to a file or to stdout, 
#  with the following tab-separated fields on each line
#      1. an EoL URL to help downloading the image or sound (or blank if no appropriate one found)
#      2. the EoL "dataObjectID", which can serve as a unique filename (or blank if no appropriate one found)
#      3. the EoL "pageID", which can serve as a unique filename (or blank if no appropriate image found)
#      4. original search term used from input file 
#      5. found taxon name 
#      6. vernacular name (default = english) 
#      7. IUCN status 
#      8. EoL spp url
#      9. item vettedStatus 
#      10. item dataRating 
#      11. item is exemplar? 
#      12. short credit line 
#      13. HTML credit line 
#      14 . long credit line(s) (each line separated by a new tab)

# The simplest use of this script is e.g. 
#      ./species2eolfiles.pl specieslist.txt ImageDetails.txt
# If you want to specify anything other than the first returned image (e.g. if you deem the first image in some cases
#  worse than the second or 3rd returned image, then you can append a tab plus a number to the appropriate line in the input 
#  file of species names (e.g. "Homo sapiens  3" to use the 3rd returned image of Homo sapiens). You then need to instruct this script
#  to select from among the first 3 images by using the -images=N command-line switch, as follows
#      ./species2eolpics.pl -images=3 specieslist.txt ImageDetails.txt
#
# To create a list of sounds, rather than images, use the -sounds or -sounds=N tag, in the same way as for -images
#      ./species2eolfiles.pl -sounds specieslist.txt SoundDetails.txt
#
# To use vernacular names in languages other than english, use the -language switch, e.g. -language=fr

#You can then download the images using the URL printed on the first field of the output file
#by using a program such as "curl".
#
# e.g. to save the file with the dataObjectID as a filename
#   perl -ne '@n=split("\t"); print "-o $n[1].$2\nurl=\"$1\"\n" if ($n[0] =~ /^(. ?\.(\w ))$/)' ImageDetails.txt | curl -K -
# or with the species page ID as the filename
#   perl -ne '@n=split("\t"); print "-o $n[2].$2\nurl=\"$1\"\n" if ($n[0] =~ /^(. ?\.(\w ))$/)' ImageDetails.txt | curl -K -
# or with the vernacular name as a prefix:
#   perl -ne '@n=split("\t"); $n[5] =~ s/\W/_/g; print "-o $n[5]=$n[1].$2\nurl=\"$1\"\n" if ($n[0] =~ /^(. ?\.(\w ))$/)' ImageDetails.txt | curl -K -
# or to get 130 x 130 thumbnails (can also use 88x88)
#   perl -ne '@n=split("\t"); $n[5] =~ s/\W/_/g; print "-o $n[5]=$n[1]$2\nurl=\"$n[0]\"\n" if ($n[0] =~ s/^(. ?)_orig(\.\w )$/$1_130_130$2/)' ImageDetails.txt | curl -K -


# or to get 130 x 130 thumbnails only for images with rating > 2.5, saved under queried species name & ID
#   perl -C -ne '@n=split("\t"); $n[5] =~ s/\W/_/g; $n[11] =~ s!./.Source:.*!!; $n[11] =~ s!/!|!; print "-o \"$n[3]=$n[1]=$n[2]=$n[11]$2\"\nurl=\"$n[0]\"\n" if (($n[0] =~ s/^(. ?)_orig(\.\w )$/$1_130_130$2/) && ($n[9] > 2.5))' ImageDetails.txt | curl -K -
# NB this is useful  awk 'NR==FNR{a[$1]=$0;next;}a[$1]{$0=a[$1]}1' OneZoom_placentals_selection.txt OneZoom_placentals_list.txt

# also useful for location data dumped from an SQL file: 
# perl -ne '@f = split("\t"); print "$f[5] $f[4]\n";' ../../Placental.dat | ./species2eolfiles.pl - ImageDetails.txt

use strict;
use warnings;
use Encode;
use URI::Escape;
use HTML::Entities;
use HTML::TagFilter;
use LWP::Simple;
use JSON -support_by_pp;
use Try::Tiny;
use Data::Dumper;
use sort 'stable';
 
my ($images, $sounds, $dataType) = (0, 0, "");

if (defined($main::images) && int($main::images)>0) {
  $images=int($main::images);
  $dataType = "http://purl.org/dc/dcmitype/StillImage";
};
if (defined($main::sounds) && int($main::sounds)>0) {
  $sounds=int($main::sounds);
  $dataType = "http://purl.org/dc/dcmitype/Sound";
};

if ($images && $sounds) {
  die "You cannot get images and sounds simultaneously";
}

unless ($images || $sounds) {
  warn "You have not specified whether you want to get images or sounds. Defaulting to images.\n";
  $images=1;
  $dataType = "http://purl.org/dc/dcmitype/StillImage";
}
 
## Default settings ##
my $APIkey="0e8786f5d94e9587e31ed0f7703c9a81f3036c7f"; #replace with your own API key.
my $vernacularLANG = 'en';
$vernacularLANG = $main::language if(defined($main::language));

my $searchbase ="http://eol.org/api/search/1.0.json"; #see http://eol.org/api/docs/search
my %search_params = (
  key   => $APIkey,
  page  => 1,
  cache_ttl => 100,
  filter_by_taxon_concept_id=>"1642", # CHANGE THIS DEPENDING ON THE SPECIES YOU ARE USING
  # Will only look for species which are descendants of this node ID (e.g. for placental mammals 
  # use filter_by_taxon_concept_id=>"2844801" , see http://eol.org/pages/2844801/overview, or for 
  #  mammals in general (more likely to appear in classifications) try filter_by_taxon_concept_id=>"1642"
  # Vertebrates is 2774383
);

my $pagesbase = "http://eol.org/api/pages/1.0/"; #see http://eol.org/api/docs/pages
my %pages_params = (
  key   => $APIkey,
  images     => $images,     # only look through the first 10 images
  videos     => 0,
  sounds     => $sounds,
  text       => 0,
  cache_ttl  => 2000, #only cache for 10 secs
  licenses   =>'all', #include cc-by, cc-by-nc, cc-by-sa, cc-by-nc-sa to get objects distributed under different licences
  iucn       => 'true',
  common_names=>'true',
  details    => 'true',
  vetted     => 2, #If 'vetted' is given a value of '1', then only trusted content will be returned. If 'vetted' is '2', then only trusted and unreviewed content will be returned (untrusted content will not be returned). The default (0) is to return all content.
);

my $CC_buttons_base = "/images/"; #to load the buttons only once, save the CC buttons (http://creativecommons.org/about/downloads) to your own server, in this directory
my %CCimages = (
  'by'=>$CC_buttons_base."cc-by-80x15.png", #only require attribution
  'by-sa' => $CC_buttons_base."cc-by-sa-80x15.png", #require share-alike
  'by-nc' => $CC_buttons_base."cc-by-nc-80x15.png", #require share-alike
  'by-nc-sa' => $CC_buttons_base."cc-by-nc-sa-80x15.png",
  'publicdomain' => $CC_buttons_base."cc-by-nc-sa-80x15.png", #EOL doesn't allow cc-zero http://creativecommons.org/publicdomain/zero/1.0/
);
  
#delete the following 5 lines if you want to use CC buttons stored on your server (recommended)
$CCimages{'by'} = ' D3LeQeGXClHj54uXyxctx RYIdu/ebWZmNrl/CoT74eOHwoLCoRLJWDy8fMkKYWFhXL6FAGBw3Lxxc8PajUD2x08fPHw8vH28h6qHL128lJ6eDmScPXtWWVkZmO6Bafj9 /dAEsgGigDFjY2NXVxcrl29BjWFiQnIRUs/EICcnDDZA /hSxcvf/70GegfIDssLAzIALoPSHZ2dgL9 e7dO2DM37t3DygLFLxz w48VQMTOWYJAQQQ78EZQBLIHkQe/vrlC5AUFBQERinQYxCfAxMwkK2kpAQUT0tLCw0NhRXxUKf//PmDkYURs5BA9h7EzwPrWywe5ubhAZJA3wL9BvQhMFYhxTKQC/QzULyiomLWrFloiZOdneP/n/94Ynjw5mE9fV1eXl6IP4ERu2fPHqCjgT4HsoGxLSQkBBSBZNczZ84oKitCdAnwCwArKvwxPLgaHsj1cF9X//Onz /fv49HGzBETExMouOiA4L9gVxZabnmhuatW7YO8noYGPpYSunouKi3b98AoxSP5rKyMhVVZYhv fkEdmzZMch9i69aArafUjNTKysr29vbsepxcna6fv16QUkBPD33T gfbUsP0rb0SOssMQAEGAB0zfzj5yHE/QAAAABJRU5ErkJggg==';
$CCimages{'by-sa'} = ' D3LeQeGXClHj54uXyxctx RYIdu/ebWZmNrl/CoT74eOHwoLCoRLJWDy8fMkKYWFhXL6FAGBw3Lxxc8PajUD2x08fPHw8vH28h6qHL128lJ6eDmScPXtWWVkZmO6Bafj9 /dAEsgGigDFjY2NXVxcrl29BjWFiQnIRUs/EICcnDDZ LXQonBhwvDt5c fPgP9A2SHhYUBGUBbgWRnZyfQn /evQPG/L1794CyQME7t /AUzUwkWOWEEAAcTScASSBbKy ZYQBZC7V/Yzu4a9fvgBJQUFBYJQCPQbxOTABA9lKSkpA8bS0tNDQUFgRD3X6z58/GFkYscYY3HsQ18O5aPEJkUVLCPSIYW4eHiAJ9C3Qb0AfAmMVUiwDuUA/A8UrKipmzZqFljjZ2Tn //mPJ4ax1hDw MSqHk2WVh7W09fl5eWF BMYsXv27AHaCvQ5kA2MbSEhIaAIJLueOXNGUVkRokuAXwBYUeGPYYJ1BknqKW14INfDfV39z58 v3//Ph5twBAxMTGJjosOCPYHcmWl5Zobmrdu2TrI62FgaGIppaPjot6 fQOMUjyay8rKVFSVIb7l5xPYsWXHIPctvmoJ2H5KzUytrKxsb2/HqsfJ2en69esFJQXw9Nw/oX 0LT1I29IjrbPEABBgAEdwNeO7yfOaAAAAAElFTkSuQmCC';
$CCimages{'by-nc'} = ' v2bQaWWLPrr0zp87s6OgoLy/H1Obq6nrq1Km27lZxCXEgV15WwdXJ9eOHj4Pct0BvAv3LhCnx8sXL5YuX4/ItEOzevdvMzGxy/xQI98PHD4UFhUMlkrF4ePmSFcLCwrh8CwHA4Lh54 aGtRuB7I fPnj4eHj7eA9VD1 6eCk9PR3IOHv2rLKyMjAZANPw /fvgSSQDRQBihsbG7u4uFy7eg1qChMTkIuWfiAAzkWWwpreMNnIJtDKw5cuXv786TPQP0B2WFgYkAG0Ekh2dnYC/fnu3TtgzN 7dw8oCxS8c/sOPFUDEzmWEoKREeJiOAOSkXDlMcwsB9dIKw9//fIFSAoKCgKjFOgxiM BCRjIVlJSAoqnpaWFhobCinio03/ /MHIwojpAWTvQZwO56IlAeRAwQw4GnqYm4cHSAJ9C/Qb0IfAWIUUy0Au0M9A8YqKilmzZqHFCTs7x/8///HEMPYagpERzTNY1dM2hvX0dXl5eSH BEbsnj17gI4A hzIBsa2kJAQUASSXc cOaOorAjRJcAvAKyo8Mcw8VUlsufJMIGohgdyPdzX1f/86fP79 /j0QYMERMTk i46IBgfyBXVlquuaF565atQ7Iejo6Levv2DTBK8WguKytTUVWG JafT2DHlh2D3Lf4qiVg yk1M7WysrK9vR2rHidnp vXrxeUFMDTc/ E/tG29CBtS4 0zhIDQIABALCdLOPlj1pIAAAAAElFTkSuQmCC';
$CCimages{'by-nc-sa'} = ' v2bQaWWLPrr0zp87s6OgoLy/H1Obq6nrq1Km27lZxCXEgV15WwdXJ9eOHj4Pct0BvAv3LhCnx8sXL5YuX4/ItEOzevdvMzGxy/xQI98PHD4UFhUMlkrF4ePmSFcLCwrh8CwHA4Lh54 aGtRuB7I fPnj4eHj7eA9VD1 6eCk9PR3IOHv2rLKyMjAZANPw /fvgSSQDRQBihsbG7u4uFy7eg1qChMTkIuWfiAAzkWWwpreMNnIJuBKpWi2ECyPmDB8e/nzp89A/wDZYWFhQAbQCCDZ2dkJ9Oe7d  AMX/v3j2gLFDwzu078FQNTORYSghGRogL4AxIRsLleswsB9eIK09CADHqsXv465cvQFJQUBAYpUCPQXwOTMBAtpKSElA8LS0tNDQUVsRDnf7z5w9GFkaswQ/3HsQpcC5a5CAHCmbAYdUCUY WNEiOYW4eHiAJ9C3Qb0AfAmMVUiwDuUA/A8UrKipmzZqFFifs7Bz///zHE8PYawhY5OD3M7IP0bSgWYFpIGEP6 nr8vLyQvwJjNg9e/YAjQD6HMgGxraQkBBQBJJdz5w5o6isCNElwC8ArKjwxzDxVSVaBOLJAmRYgaUe7uvqf/70 f379/FoA4aIiYlJdFx0QLA/kCsrLdfc0Lx1y9YhWQ9Hx0W9ffsGGKV4NJeVlamoKkN8y88nsGPLjkHuW3zVErD9lJqZWllZ2d7ejlWPk7PT9evXC0oK4Om5f0L/aFt6kLalR1pniQEgwAAqmGXUQtYz7AAAAABJRU5ErkJggg==';
$CCimages{'publicdomain'} = ' /39/fYGBgEBAQ7 /vICAgMDAwQEBAz8/PUFBQn5 fgICAcHBwj4 Pr6 v////AAAAAWAWqQAAAN9JREFUeNqsU4sOwyAIRG199DHu/792gLXNui1LptgSRDkPFMJgITAzAWXiKi4iOP5bDHCaUcTOa1kXmUUkdqGH4Q4k4s3mguzEy74HMKqhlGIIWah6eM49gMCS1J6nyapHmNXJpnRUw/5zAXcn6lwALd4Zy61ex7mbj1HVoXGFc/3auSdgqmdq1j5fgB8Y3gHbejMuhrozL7Po9LgY4hdDvGAfDEVLpoK1MVsxw1FDtOjG90sN3wAjdoHSfL1nKmr03fKOIH1S3yFZ33S w7NT/EpDOkU5 XG9PFieAgwAnEVFEgHUtZAAAAAASUVORK5CYII=';

## Program starts here ##

die("The name of a file of taxa names (or - for stdin) is required as a first argument\n") unless ($ARGV[0]);
my @taxa =(); #store all the data in here

if ($ARGV[0] eq "-") {
  *NAMES = *STDIN;
} else {
  open(NAMES, "<", $ARGV[0])
    or die "cannot open ".$ARGV[0]." for reading: $!";
}

if ($ARGV[1] && $ARGV[1] ne "-") {
  open(OUTPUT, ">:utf8", $ARGV[1])
    or die "cannot open ".$ARGV[1]." for (over)writing: $!";
} else {
  *OUTPUT = *STDOUT;
  binmode OUTPUT, ":utf8";
}

binmode STDERR, ":utf8";
select(STDERR); #for outputting progress
$|=1; #turn on autoflushing
select(OUTPUT);
$|=1; #turn on autoflushing

while(<NAMES>) {
  #get species names from first field (before tab)
  #if second field is a simple integer, try to get a different item to the first available
  chomp;
  my @fields = split("\t");
  $fields[0] =~ s/^\s //; #remove initial whitespace
  $fields[0] =~ s/\s $//; #remove terminal whitespace, including endings
  $fields[0] =~ s/[\s_] / /g; #replace whitespace and underscores with simple space
  $fields[1] = "1" unless (defined($fields[1]) && ($fields[1] =~ /^\d \s*$/));
  $fields[1] =~ s/\s $//; #remove terminal whitespace, including endings
  push(@taxa, {search=>$fields[0], objectNum=>$fields[1]});   #caution - this could be a blank line
};
close(NAMES);

$search_params{q} = "";
my $line_sep="";
foreach my $taxon (@taxa) {
  print $line_sep; $line_sep="\n"; #don't print initial newline on first iteration
  if ($taxon->{search}) {
    $search_params{q} = $taxon->{search};
    $search_params{exact} = 'true';
    my $url = $searchbase."?".join("&", map{"$_=$search_params{$_}"} keys %search_params);
    my $i=10; #try getting the search page a few times
    while (--$i) {
      my $sp = fetch_json_page($url);
      if ($sp && exists $sp->{totalResults}) {
        if ($sp->{totalResults} <= 0) {
          if ($search_params{exact} eq 'true') {
            warn "No species found in EoL for exact match against $taxon->{search} - trying a non-exact search\n";    
		    $search_params{exact} = 'false';
    		$url = $searchbase."?".join("&", map{"$_=$search_params{$_}"} keys %search_params);
    		$i = 10; #reset
    		next;
          } else {
            warn "No species found in EoL for non-exact match against $taxon->{search} - abandoning\n";    
          }
        } else {
          my $target = $sp->{results}->[-1]; #default to the last one (sometimes with the authority also listed)
          if ($sp->{totalResults} > 1) {
            my $w = "Found ".scalar($sp->{totalResults})." pages for $taxon->{search}";
            #hope there aren't more that 30 found, otherwise we are scuppered
			my @matches = grep {($_->{title} =~ /^$taxon->{search}/i)} @{$sp->{results}};
			if (scalar(@matches) == 0) {
				warn "$w, with no precise matches, defaulting to $target->{title} (id: $target->{id})\n";
			} else {
				$target = $matches[-1]; #use last precise match (usually the one with extra authority info)
                if (scalar(@matches) > 1) {
                  warn "$w, with multiple precise matches, using $target->{title} (id: $target->{id})\n";
                } else {
                  warn "$w, but only one precise match ($target->{title}, id: $target->{id})\n";
                };
			};
          };
          if ($search_params{exact} eq 'false') {
            my $w = "Using non-exact match:";
            $w .= " title='".$target->{title}."'" if ($target->{title});
            $w .= " content='".$target->{content}."'" if ($target->{content});
            warn "$w\n";      
          }
           my $id = $target->{id};
         if (my $result = getPageInfo($id, $taxon->{objectNum})) {
          	print join("\t", outputFields($taxon->{search}, $result));
          };
        };
        last;
      };
      warn "Trying search again for $url";
      sleep(2); #wait a bit and try again
    };
    warn "Could not get any results when searching EoL for $taxon->{search} (url = $url). Tried multiple times\n" unless ($i);
  };
};

sub dataObjectID2URL {
  return $_[0] ? "http://eol.org/data_objects/$_[0]" : '';	
}

sub pageID2URL {
  return $_[0] ? "http://eol.org/pages/$_[0]/overview" : '';
}

sub getPageInfo {
  my %data;
  my $id = shift;
  my $objectNum = shift;
  return \%data unless ($id);
  #hopefully these are the only 3 possible return values for vettedStatus; 
  #perl should throw an "Use of uninitialized value" error if not
  my %vettedStatus = (Trusted=>1, Unknown => 0, Unreviewed=>0, Untrusted => -1); 
    
  $data{speciesID} = $id;
  my $url = $pagesbase.$id.".json?".join("&", map{"$_=$pages_params{$_}"} keys %pages_params);

  my $i=0;
  my $pg;
  while (not($pg = fetch_json_page($url))) {
    warn "Trying to find info again for $url";
    last if (  $i==10); #try getting the page a few times
    sleep(2); #wait a bit and try again
  };
  
  unless ($pg) {
    warn "Error in getting json page result from EoL for page id $id (url = '$url'), tried $i times\n";    
  } else {
    #sort out names etc.
    if (ref($pg) ne "HASH") {
    	warn "Error in EoL json page for page id $id (url = '$url'): no labelled data\n";    
    	return(\%data);
    };
    
    $data{name} = $pg->{scientificName} if (exists($pg->{scientificName}));
    if (@{$pg->{vernacularNames}}) {
      foreach my $vn (@{$pg->{vernacularNames}}) {
        if ($vn->{language} eq $vernacularLANG) {
          if (!$data{vernacular} || ($vn->{eol_preferred} && $vn->{eol_preferred} eq 'true')) {
            $data{vernacular} = $vn; #pick the first one
          }
        }
      }
      if ($data{vernacular}) {
		unless ($data{vernacular}->{eol_preferred}) {
	      	warn "For $data{name}, defaulting to using $data{vernacular}->{vernacularName}\n\n";
		};
      	$data{vernacular} = $data{vernacular}->{vernacularName};
      };
    }

    #sort out objects (media & IUCN status)
    if (!($pg->{dataObjects}) || 0==@{$pg->{dataObjects}}) {
      #this is the most common problem - so indent the error to be able to see other errors more clearly
      #warn "     (no appropriate data objects found in EoL for $data{name} at ".pageID2URL($id)."\n";
    } else {

      #IUCN status has "dataType": "http://purl.org/dc/dcmitype/Text"
      my @IUCN = grep {($_->{dataType} =~ /Text$/) &&
                       ($_->{title} eq "IUCNConservationStatus")} @{$pg->{dataObjects}};
      if (@IUCN==0) {
        warn "     (no IUCN data for $data{name} (id $id) at ".pageID2URL($id)."\n";
      } else {
        warn "More than one IUCN source for $data{name} (id $id) at ".pageID2URL($id)."\n" if (@IUCN>1);
        $data{IUCN} = $IUCN[0]->{description};
      };

      if ($objectNum < 1) {
            warn("You asked for nothing to be returned for $data{name} (id $id)\n");
      } else {
        my @obj = grep {($_->{dataType} eq $dataType)} @{$pg->{dataObjects}};
#       Exemplar gets put first, so sometimes this is not quite in order
#        my $o1 = "@obj";
#        @obj = sort {$vettedStatus{$b->{vettedStatus}} <=> $vettedStatus{$a->{vettedStatus}} ||
#                     $b->{dataRating} <=> $a->{dataRating}} @obj; #should be able to get away with taking just the first one
#        warn("EoL is not returning the items in the documented order, for $url \n") if "@obj" ne $o1;
        if (@obj == 0) {
          #  warn "      (no valid items found in EoL for $data{name} at ".pageID2URL($id)."\n";
        } else {
          if ($objectNum > @obj) {
            warn("You asked for item number $objectNum for $data{name} (id $id) but there ",
            scalar(@obj)==1?"is only one item":"are only ".scalar(@obj)." items",
            ": will use the last one.\n");
            $objectNum = scalar(@obj);
          }
          my $obj = $obj[$objectNum-1];
          $data{vettedStatus}=$obj->{vettedStatus};
          $data{dataRating}=$obj->{dataRating};
          $data{original} = $obj->{source} || $obj->{mediaURL}; #e.g. http://commons.wikimedia.org/wiki/File:Masai_Woman.jpg
          $data{licence} = $obj->{license};
          $data{eolMediaURL} = $obj->{eolMediaURL}; # e.g. 'http://media.eol.org/content/2012/06/12/20/17507_orig.jpg',
          $data{mediaURL} = $obj->{mediaURL}; #e.g. 'http://upload.wikimedia.org/wikipedia/commons/a/ac/Masai_Woman.jpg'
          $data{dataObjectVersionID} = $obj->{dataObjectVersionID};
          $data{rightsHolder} = $obj->{rightsHolder};
          foreach my $agent (@{$obj->{agents}}) {
            if ($agent->{role}) {
              if ($agent->{role} =~ /^(photographer|fot.grafo|fotograaf|fotograf|creator|illustrator)$/) {
                $data{itemBy} = trimname($agent->{full_name});
                $data{itemByURL} = $agent->{homepage};
              } elsif ($agent->{role} =~ /^(provider|anbieter)$/) {
                $data{itemFrom} = $agent->{full_name};
                $data{itemFromURL} = $agent->{homepage};        
              } elsif ($agent->{role} eq 'publisher') {
                #ignore this role
              } else {
                warn "Unknown role: '$agent->{role}' on page id $id (".pageID2URL($id).") for data object $data{dataObjectVersionID}\n";
              }
            } else {
                warn "No role found on page id $id (".pageID2URL($id).") for data object $data{dataObjectVersionID}\n";
            }
          };
          if (($obj->{created}) && ($obj->{created} =~ m/^(\d\d\d\d)/)) {
            $data{year} = $1;
          };
        };
      };
    };
  };
  return(\%data);	
}

sub outputFields {
  my $filter_all =  HTML::TagFilter->new(allow=>{}); #delete everything.
  my $filter_most = new HTML::TagFilter; #defaults allow text markup, links, & images
  $filter_most->deny_tags({ img => { all => [] }});
  my $searchterm = shift;
  my $d = shift;

  my $licences = mklicence($d->{licence});
  my $lic;

  my @short = map {$filter_most->filter($_)} makeShortText($d);
  my @HTML  = map {$filter_most->filter($_)} makeHTML($d);
  my @long  = map {$filter_all->filter($_)} makeLongText($d);

  splice(@short, scalar(@short) ? 1 : 0, 0, $lic) if ($lic=$licences->{short}); #insert licence into middle
  unshift(@HTML, $lic) if ($lic = $licences->{HTML});
  unshift(@long, $lic) if ($lic = $licences->{long});
  
  s/\s / / foreach (@short);
  s/\s / / foreach (@HTML);
  s/\s / / foreach (@long);
    
  return(
    $d->{eolMediaURL} || '',    # 1. an EoL URL to help downloading the media file (or blank if none found)
    # either of next 2 fields can serve as a unique filename (blank if none found)
    $d->{dataObjectVersionID} || '',  # 2. the EoL "dataObjectID"
    $d->{speciesID} || '',      # 3. the EoL "page ID"
    $searchterm,                # 4. original search term used from input file 
    $d->{name} || '',           # 5. found taxon name
    $d->{vernacular} || '',     # 6. vernacular name (english)
    $d->{IUCN} || '',           # 7. IUCN status
    ($d->{speciesID})?pageID2URL($d->{speciesID}):"",# 8. EoL spp url
    $d->{vettedStatus} || '',   # 9. object vettedStatus 
    $d->{dataRating} || '',     # 10. object quality score (unscored = 2.5) 
    '',       # 11. object exemplar status //needs API to support exemplar.
    join("\x{2009}/\x{2009}", @short), # 12. short credit line 
    join("<br />",@HTML),       # 13. HTML credit line 
    join("\t", @long)           # 14 . long credit line(s) (each line separated by a new tab)
  )
}

sub mklicence {
  #examples of URLs passed in:
  #  http://creativecommons.org/licenses/by-nc-sa/3.0/
  # becomes
  #   short => CreativeCommons BY-NC-SA 3.0
  #   HTML  => <a href='http://creativecommons.org/licenses/by-nc-sa/3.0/'><img src='myNCSAimage' alt="CC BY-NC-SA 3.0"/></a>
  #   long  => CC BY-NC-SA 3.0 (http://creativecommons.org/licenses/by-nc-sa/3.0/)
  #Whereas
  #  http://creativecommons.org/licenses/publicdomain/
  # becomes
  #   short => Public Domain
  #   HTML  => <a href='http://creativecommons.org/licenses/publicdomain/'><img src='myPDimage' alt="Public Domain"/></a>
  #   long  => Public Domain (e.g. http://creativecommons.org/licenses/publicdomain/)
  #Other urls remain unchanged:
  #  http://foo/bar/
  # becomes
  #   short => http://foo/bar
  #   HTML  => <a href='http://foo/bar'>http://foo/bar/</a> (if starting with http://)
  #   long  => http://foo/bar
  my $in = $_[0];
  my %strings = (short=>$in, HTML =>$in, long =>$in);
  if ($in) {
    $strings{HTML} = ($in =~ m|^https?://|) ? "<a href='$in'>$in</a>" : $in; #default

    if (my ($desc) = ($in =~ m|http://creativecommons.org/licenses/(.*?)/?$|)) {
      if ($desc =~ m|(publicdomain)|) {
        my $type=$1;
        $strings{short} = "Public Domain";
        $strings{HTML} = "<a href='$in'><img src='".$CCimages{$type}."' alt='Public Domain' /></a>";
        $strings{long} = "Public Domain (e.g. $in)";
      } elsif ($desc =~ s|([^/]*)/(.*)|\U$1 $2|) { #by-nc-sa/3.0 -> BY-NC-SA 3.0
        my $type=$1;
        $strings{short} = "CreativeCommons $desc";
        if (exists $CCimages{$type}) {
          $strings{HTML} = "<a href='$in'><img src='".$CCimages{$type}."' alt='CC $desc' /></a>";
        } else {
          warn "couldn't find key $type in \$CCimages\n";
          $strings{HTML} = "<a href='$in'>CC $desc</a>";        
        }
        $strings{long} = "CC $desc ($in)";
      }
    }
  }
  return \%strings;
}

sub makeShortText {
  # Short text form from EOL is e.g. 
  #    © $rightsHolder ($rights) / Source: $provider via $EoL_URL
  # or if $rightsHolder missing,
  #    $photographer (year) / Source: $provider via $EoL_URL
  my $data = shift;
  my @content = ();

  if (my $objURL = dataObjectID2URL($data->{dataObjectVersionID})) {
  	#always need a dataObjectVersionID if we are to return anything
    if ($data->{rightsHolder} || $data->{rights}) {
      if ($data->{rightsHolder}) {
        $content[0] = Encode::decode_utf8("© ").$data->{rightsHolder};
        $content[0] .= " ($data->{rights})"if $data->{rights};
      } else {
        $content[0] = $data->{rights};
      }
    } elsif ($data->{itemBy}) {
      $content[0] = $data->{itemBy};
      $content[0] .= " ($data->{year})"if $data->{year};
    } else {
	  warn "Cannot find an attributor for ".dataObjectID2URL($data->{dataObjectVersionID}).", using 'Author unclear'\n";
   	 #no rightsHolder or itemBy. Hmm.
      $content[0] = "Author unclear: see EoL URL";
    };
  
    $content[1] = "Source: ";
    if ($data->{itemFrom}) {
      $content[1] .= "$data->{itemFrom} via ";
    } elsif ($data->{original}) {
      $content[1] .= "$data->{original} via ";
    }
    $content[1] .= "$objURL";
  }
  return(@content);
}

sub makeHTML {
  # WARNING - HTML links could point anywhere - even to dangerous sites. HTML form copied from EoL, for example:
  # <a href="">$photographer</a> ($year)<br />&copy; $rightsHolder ($rights)<br />From <a href=""></a> via <a href="">Encyclopedia of Life</a>
  my $data = shift;
  my @content = ();

  if (my $objURL = dataObjectID2URL($data->{dataObjectVersionID})) {  
    if ($data->{itemBy}) {
      if ($data->{itemByURL}) {
  	    push @content, "By <a href='$data->{itemByURL}'>".encode_entities($data->{itemBy})."</a>";
   	  } else {
   	    push @content, encode_entities("$data->{itemBy}");
      }
      $content[-1] .= " ($data->{year})" if $data->{year};
    }
    if ($data->{rightsHolder} || $data->{rights}) {
      if ($data->{rightsHolder}) {
        push @content, '&copy; '.$data->{rightsHolder};
        $content[-1] .= " ($data->{rights})" if $data->{rights};
      } else {
        push @content, $data->{rights};
      }
    }
  
    if ($data->{original}) {
      push @content, "<a href='$data->{original}'>Original</a> from ";
    } else {
	  push @content, "Original from ";
    }  	
    if ($data->{itemFrom}) {
      if ($data->{itemFromURL}) {
        $content[-1] .= "<a href='$data->{itemFromURL}'>$data->{itemFrom}</a> via ";
      } else {
        $content[-1] .= "$data->{itemFrom} via ";
      }
    }
    $content[-1] .= "<a href='$objURL'>$objURL</a>";
  }
  return(@content);
}

sub makeLongText {
  #Long text form is essentially a text equivalent of the HTML form, lines separated by tabs, example
  #    $photographer ($year)
  #    © $rightsHolder ($rights)
  #    From $provider via Encyclopedia of Life ($EoL_URL)
  # the URLs of the source and 
  my $data = shift;
  my @content= ();
  
  if ($data->{itemBy}) {
    push @content, "$data->{itemBy}";
    $content[-1] .= " ($data->{year})" if $data->{year};
  }
  if ($data->{rightsHolder} || $data->{rights}) {
    if ($data->{rightsHolder}) {
      push @content, Encode::decode_utf8("© ").$data->{rightsHolder};
      $content[-1] .= " ($data->{rights})" if $data->{rights};
    } else {
      push @content, $data->{rights};
    }
  }

  if (my $objURL = dataObjectID2URL($data->{dataObjectVersionID})) {  
    push @content, "From ";
    if ($data->{itemFrom}) {
      $content[-1] .= "$data->{itemFrom} via ";
    } elsif ($data->{original}) {
      if ($data->{original} =~ m|(http://[^/] )|) {
        $content[-1] .= "$1 via ";
      } else {
        $content[-1] .= "$data->{original} via ";
      }
    }
    $content[-1] .= "$objURL";
  };
  return(@content);
}

sub fetch_json_page
{
  my $json = new JSON;
  my ($json_url) = shift;
  # download the json page:
  my $json_text;
  my $content = get( $json_url );
  if (defined($content)) {
    try {
    # these are some nice json options to relax restrictions a bit:
      $json_text=$json->allow_nonref->utf8->relaxed->escape_slash->loose->allow_singlequote->allow_barekey->decode($content);
    } catch {
      warn "In string \"$content\" - JSON error: $_ \n";
    };
  };
  return $json_text;
}

sub trimname {
  #take an author name and make it nicer
  my $name = decode_entities(shift);
  $name =~ s/^\s*creator:?//i; #common idiom to start wikicommons authors with e.g. Creator: Joseph Smit
  $name =~ s/^\s*//;
  $name =~ s/\s*$//;
  $name =~ s/^(?:<. ?>)*unknown(?:<. ?>)*$//i; #strip "unknown" and tags
  return $name;
}

2 thoughts on “Encyclopedia of Life image scraper

  1. Pingback: Spotting maps among images of organisms | A Scientific View

  2. Pingback: Schrëwdinger and her descendants | A Scientific View

Leave a Reply

Your email address will not be published. Required fields are marked *