2009-11-20 00:22:53 +01:00
// genericImageParser.java
2009-10-20 00:34:44 +02:00
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 16.10.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
2010-12-27 18:07:21 +01:00
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2009-10-20 00:34:44 +02:00
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser.images ;
import java.awt.image.BufferedImage ;
2010-07-27 09:13:15 +02:00
import java.io.ByteArrayInputStream ;
2009-10-20 00:34:44 +02:00
import java.io.EOFException ;
2010-01-19 15:59:58 +01:00
import java.io.File ;
import java.io.FileInputStream ;
import java.io.FileNotFoundException ;
2009-10-20 00:34:44 +02:00
import java.io.IOException ;
2010-01-19 15:59:58 +01:00
import java.io.InputStream ; import java.net.MalformedURLException ;
import java.util.HashMap ;
2009-10-20 00:34:44 +02:00
import java.util.HashSet ;
2010-07-27 09:13:15 +02:00
import java.util.Iterator ;
2009-10-20 00:34:44 +02:00
import java.util.Set ;
import javax.imageio.ImageIO ;
2010-07-27 09:13:15 +02:00
import com.drew.imaging.jpeg.JpegProcessingException ;
import com.drew.imaging.jpeg.JpegSegmentReader ;
import com.drew.metadata.Directory ;
import com.drew.metadata.Metadata ;
import com.drew.metadata.MetadataException ;
import com.drew.metadata.Tag ;
import com.drew.metadata.exif.ExifReader ;
import com.drew.metadata.iptc.IptcReader ;
2010-05-25 14:54:57 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2009-10-20 00:34:44 +02:00
import net.yacy.document.AbstractParser ;
import net.yacy.document.Document ;
2010-06-29 21:20:45 +02:00
import net.yacy.document.Parser ;
2009-10-20 00:34:44 +02:00
import net.yacy.document.parser.html.ImageEntry ;
2010-03-12 13:23:38 +01:00
import net.yacy.document.parser.images.bmpParser.IMAGEMAP ;
2010-01-17 01:41:50 +01:00
import net.yacy.kelondro.logging.Log ;
2010-03-12 13:23:38 +01:00
import net.yacy.kelondro.util.FileUtils ;
2009-10-20 00:34:44 +02:00
2010-06-29 21:20:45 +02:00
public class genericImageParser extends AbstractParser implements Parser {
2009-10-20 00:34:44 +02:00
/**
 * The mime types that this parser class accepts.
 * @see #supportedMimeTypes()
 */
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();

/**
 * The file name extensions that this parser class accepts.
 * @see #supportedExtensions()
 */
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();

static {
    // the literals must not carry surrounding blanks, otherwise a lookup
    // with a real extension or mime type can never succeed
    SUPPORTED_EXTENSIONS.add("png");
    SUPPORTED_EXTENSIONS.add("gif");
    SUPPORTED_EXTENSIONS.add("jpg");
    SUPPORTED_EXTENSIONS.add("jpeg");
    SUPPORTED_EXTENSIONS.add("jpe");
    SUPPORTED_EXTENSIONS.add("bmp");

    SUPPORTED_MIME_TYPES.add("image/png");
    SUPPORTED_MIME_TYPES.add("image/gif");
    SUPPORTED_MIME_TYPES.add("image/jpeg");
    // 'image/jpg' is in fact a wrong mime type; it is accepted because
    // this error occurs frequently on the internet
    SUPPORTED_MIME_TYPES.add("image/jpg");
    SUPPORTED_MIME_TYPES.add("image/bmp");
}
/**
 * Creates the parser and passes its display name to the
 * {@code AbstractParser} constructor.
 */
public genericImageParser() {
    // name literal without the stray leading/trailing blanks
    super("Generic Image Parser");
}
2010-06-29 21:20:45 +02:00
public Document [ ] parse (
2010-05-25 14:54:57 +02:00
final MultiProtocolURI location ,
2009-10-20 00:34:44 +02:00
final String mimeType ,
final String documentCharset ,
2010-06-29 21:20:45 +02:00
final InputStream sourceStream ) throws Parser . Failure , InterruptedException {
2010-03-12 13:23:38 +01:00
ImageInfo ii = null ;
String title = null ;
String author = null ;
String keywords = null ;
String description = null ;
if ( mimeType . equals ( " image/bmp " ) | |
location . getFileExtension ( ) . equals ( " bmp " ) ) {
byte [ ] b ;
try {
b = FileUtils . read ( sourceStream ) ;
} catch ( IOException e ) {
Log . logException ( e ) ;
2010-06-29 21:20:45 +02:00
throw new Parser . Failure ( e . getMessage ( ) , location ) ;
2010-03-12 13:23:38 +01:00
}
IMAGEMAP imap = bmpParser . parse ( b ) ;
ii = parseJavaImage ( location , imap . getImage ( ) ) ;
} else if ( mimeType . equals ( " image/jpg " ) | |
location . getFileExtension ( ) . equals ( " jpg " ) | |
location . getFileExtension ( ) . equals ( " jpeg " ) | |
location . getFileExtension ( ) . equals ( " jpe " ) ) {
// use the exif parser from
// http://www.drewnoakes.com/drewnoakes.com/code/exif/
// javadoc is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/javadoc/
// a tutorial is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/sampleUsage.html
2010-07-27 09:13:15 +02:00
byte [ ] b ;
2010-03-12 13:23:38 +01:00
try {
2010-07-27 09:13:15 +02:00
b = FileUtils . read ( sourceStream ) ;
2010-03-12 13:23:38 +01:00
} catch ( IOException e ) {
2010-07-27 09:13:15 +02:00
Log . logException ( e ) ;
2010-06-29 21:20:45 +02:00
throw new Parser . Failure ( e . getMessage ( ) , location ) ;
2010-03-12 13:23:38 +01:00
}
2010-07-27 09:13:15 +02:00
ii = parseJavaImage ( location , new ByteArrayInputStream ( b ) ) ;
JpegSegmentReader segmentReader ;
try {
segmentReader = new JpegSegmentReader ( new ByteArrayInputStream ( b ) ) ;
byte [ ] exifSegment = segmentReader . readSegment ( JpegSegmentReader . SEGMENT_APP1 ) ;
byte [ ] iptcSegment = segmentReader . readSegment ( JpegSegmentReader . SEGMENT_APPD ) ;
Metadata metadata = new Metadata ( ) ;
new ExifReader ( exifSegment ) . extract ( metadata ) ;
new IptcReader ( iptcSegment ) . extract ( metadata ) ;
@SuppressWarnings ( " unchecked " )
Iterator < Directory > directories = metadata . getDirectoryIterator ( ) ;
HashMap < String , String > props = new HashMap < String , String > ( ) ;
while ( directories . hasNext ( ) ) {
Directory directory = directories . next ( ) ;
@SuppressWarnings ( " unchecked " )
Iterator < Tag > tags = directory . getTagIterator ( ) ;
while ( tags . hasNext ( ) ) {
Tag tag = tags . next ( ) ;
try {
props . put ( tag . getTagName ( ) , tag . getDescription ( ) ) ;
ii . info . append ( tag . getTagName ( ) + " : " + tag . getDescription ( ) + " . \ n " ) ;
} catch ( MetadataException e ) {
2010-09-21 23:48:42 +02:00
//Log.logException(e);
2010-07-27 09:13:15 +02:00
}
}
title = props . get ( " Image Description " ) ;
if ( title = = null | | title . length ( ) = = 0 ) title = props . get ( " Headline " ) ;
if ( title = = null | | title . length ( ) = = 0 ) title = props . get ( " Object Name " ) ;
author = props . get ( " Artist " ) ;
if ( author = = null | | author . length ( ) = = 0 ) author = props . get ( " Writer/Editor " ) ;
if ( author = = null | | author . length ( ) = = 0 ) author = props . get ( " By-line " ) ;
if ( author = = null | | author . length ( ) = = 0 ) author = props . get ( " Credit " ) ;
if ( author = = null | | author . length ( ) = = 0 ) author = props . get ( " Make " ) ;
keywords = props . get ( " Keywords " ) ;
if ( keywords = = null | | keywords . length ( ) = = 0 ) keywords = props . get ( " Category " ) ;
if ( keywords = = null | | keywords . length ( ) = = 0 ) keywords = props . get ( " Supplemental Category(s) " ) ;
description = props . get ( " Caption/Abstract " ) ;
if ( description = = null | | description . length ( ) = = 0 ) description = props . get ( " Country/Primary Location " ) ;
if ( description = = null | | description . length ( ) = = 0 ) description = props . get ( " Province/State " ) ;
if ( description = = null | | description . length ( ) = = 0 ) description = props . get ( " Copyright Notice " ) ;
}
} catch ( JpegProcessingException e ) {
Log . logException ( e ) ;
// just ignore
}
2010-03-12 13:23:38 +01:00
} else {
ii = parseJavaImage ( location , sourceStream ) ;
}
final HashSet < String > languages = new HashSet < String > ( ) ;
2010-05-25 14:54:57 +02:00
final HashMap < MultiProtocolURI , String > anchors = new HashMap < MultiProtocolURI , String > ( ) ;
final HashMap < MultiProtocolURI , ImageEntry > images = new HashMap < MultiProtocolURI , ImageEntry > ( ) ;
2010-03-12 13:23:38 +01:00
// add this image to the map of images
String infoString = ii . info . toString ( ) ;
2010-05-25 14:54:57 +02:00
images . put ( ii . location , new ImageEntry ( location , " " , ii . width , ii . height , - 1 ) ) ;
2010-03-12 13:23:38 +01:00
2010-12-17 01:52:24 +01:00
if ( title = = null | | title . length ( ) = = 0 ) title = MultiProtocolURI . unescape ( location . getFileName ( ) ) ;
2010-03-12 13:23:38 +01:00
2010-06-29 21:20:45 +02:00
return new Document [ ] { new Document (
2010-03-12 13:23:38 +01:00
location ,
mimeType ,
" UTF-8 " ,
languages ,
keywords = = null ? new String [ ] { } : keywords . split ( keywords . indexOf ( ',' ) > 0 ? " , " : " " ) , // keywords
title , // title
2010-05-11 13:14:05 +02:00
author = = null ? " " : author , // author
location . getHost ( ) , // Publisher
2010-03-12 13:23:38 +01:00
new String [ ] { } , // sections
description = = null ? " " : description , // description
infoString . getBytes ( ) , // content text
anchors , // anchors
2010-08-25 20:24:54 +02:00
null ,
2010-03-12 13:23:38 +01:00
images ,
2010-06-29 21:20:45 +02:00
false ) } ; // images
2010-03-12 13:23:38 +01:00
}
/**
 * @return the shared {@link #SUPPORTED_MIME_TYPES} set itself, not a copy
 */
public Set<String> supportedMimeTypes() {
    final Set<String> mimeTypes = SUPPORTED_MIME_TYPES;
    return mimeTypes;
}
/**
 * @return the shared {@link #SUPPORTED_EXTENSIONS} set itself, not a copy
 */
public Set<String> supportedExtensions() {
    final Set<String> extensions = SUPPORTED_EXTENSIONS;
    return extensions;
}
public static ImageInfo parseJavaImage (
2010-05-25 14:54:57 +02:00
final MultiProtocolURI location ,
2010-06-29 21:20:45 +02:00
final InputStream sourceStream ) throws Parser . Failure {
2009-10-20 00:34:44 +02:00
BufferedImage image = null ;
try {
2010-03-26 11:43:31 +01:00
ImageIO . setUseCache ( false ) ; // do not write a cache to disc; keep in RAM
2009-10-20 00:34:44 +02:00
image = ImageIO . read ( sourceStream ) ;
} catch ( final EOFException e ) {
2010-01-17 01:41:50 +01:00
Log . logException ( e ) ;
2010-06-29 21:20:45 +02:00
throw new Parser . Failure ( e . getMessage ( ) , location ) ;
2009-10-20 00:34:44 +02:00
} catch ( final IOException e ) {
2010-01-17 01:41:50 +01:00
Log . logException ( e ) ;
2010-06-29 21:20:45 +02:00
throw new Parser . Failure ( e . getMessage ( ) , location ) ;
2009-10-20 00:34:44 +02:00
}
2010-06-29 21:20:45 +02:00
if ( image = = null ) throw new Parser . Failure ( " ImageIO returned NULL " , location ) ;
2010-03-12 13:23:38 +01:00
return parseJavaImage ( location , image ) ;
}
public static ImageInfo parseJavaImage (
2010-05-25 14:54:57 +02:00
final MultiProtocolURI location ,
2010-04-27 23:47:41 +02:00
final BufferedImage image ) {
2010-03-12 13:23:38 +01:00
ImageInfo ii = new ImageInfo ( location ) ;
ii . image = image ;
2009-10-20 00:34:44 +02:00
// scan the image
2010-03-12 13:23:38 +01:00
ii . height = ii . image . getHeight ( ) ;
ii . width = ii . image . getWidth ( ) ;
2009-11-20 00:22:53 +01:00
/ *
2009-10-20 00:34:44 +02:00
Raster raster = image . getData ( ) ;
int [ ] pixel = raster . getPixel ( 0 , 0 , ( int [ ] ) null ) ;
2009-10-23 00:38:04 +02:00
long [ ] average = new long [ pixel . length ] ;
for ( int i = 0 ; i < average . length ; i + + ) average [ i ] = 0L ;
2009-10-20 00:34:44 +02:00
int pc = 0 ;
for ( int x = width / 4 ; x < 3 * width / 4 ; x = x + 2 ) {
for ( int y = height / 4 ; y < 3 * height / 4 ; y = y + 2 ) {
pixel = raster . getPixel ( x , y , pixel ) ;
2009-10-23 00:38:04 +02:00
for ( int i = 0 ; i < average . length ; i + + ) average [ i ] + = pixel [ i ] ;
2009-10-20 00:34:44 +02:00
pc + + ;
}
}
2009-10-23 00:38:04 +02:00
* /
2009-10-20 00:34:44 +02:00
// get image properties
2010-03-12 13:23:38 +01:00
String [ ] propNames = ii . image . getPropertyNames ( ) ;
2009-10-23 00:38:04 +02:00
if ( propNames = = null ) propNames = new String [ 0 ] ;
2010-03-12 13:23:38 +01:00
ii . info . append ( " \ n " ) ;
2009-10-20 00:34:44 +02:00
for ( String propName : propNames ) {
2010-03-12 13:23:38 +01:00
ii . info . append ( propName ) . append ( " = " ) . append ( ii . image . getProperty ( propName ) ) . append ( " . \ n " ) ;
2009-10-20 00:34:44 +02:00
}
2010-01-19 15:59:58 +01:00
// append also properties that we measured
2010-03-12 13:23:38 +01:00
ii . info . append ( " width " ) . append ( " : " ) . append ( Integer . toString ( ii . width ) ) . append ( " . \ n " ) ;
ii . info . append ( " height " ) . append ( " : " ) . append ( Integer . toString ( ii . height ) ) . append ( " . \ n " ) ;
2009-10-20 00:34:44 +02:00
2010-03-12 13:23:38 +01:00
return ii ;
2009-10-20 00:34:44 +02:00
}
2010-03-12 13:23:38 +01:00
public static class ImageInfo {
2010-05-25 14:54:57 +02:00
public MultiProtocolURI location ;
2010-03-12 13:23:38 +01:00
public BufferedImage image ;
public StringBuilder info ;
public int height ;
public int width ;
2010-05-25 14:54:57 +02:00
public ImageInfo ( final MultiProtocolURI location ) {
2010-03-12 13:23:38 +01:00
this . location = location ;
this . image = null ;
this . info = new StringBuilder ( ) ;
this . height = - 1 ;
this . width = - 1 ;
}
2009-10-20 00:34:44 +02:00
}
2010-03-12 13:23:38 +01:00
2010-01-19 15:59:58 +01:00
public static void main ( final String [ ] args ) {
File image = new File ( args [ 0 ] ) ;
genericImageParser parser = new genericImageParser ( ) ;
2010-05-25 14:54:57 +02:00
MultiProtocolURI uri ;
2010-01-19 15:59:58 +01:00
try {
2010-05-25 14:54:57 +02:00
uri = new MultiProtocolURI ( " http://localhost/ " + image . getName ( ) ) ;
2010-06-29 21:20:45 +02:00
Document [ ] document = parser . parse ( uri , " image/ " + uri . getFileExtension ( ) , " UTF-8 " , new FileInputStream ( image ) ) ;
System . out . println ( document [ 0 ] . toString ( ) ) ;
2010-01-19 15:59:58 +01:00
} catch ( MalformedURLException e ) {
e . printStackTrace ( ) ;
} catch ( FileNotFoundException e ) {
e . printStackTrace ( ) ;
2010-06-29 21:20:45 +02:00
} catch ( Parser . Failure e ) {
2010-01-19 15:59:58 +01:00
e . printStackTrace ( ) ;
} catch ( InterruptedException e ) {
e . printStackTrace ( ) ;
}
}
2009-10-20 00:34:44 +02:00
}