|
List Info
Thread: writing a new parse-exe plugin
|
|
| writing a new parse-exe plugin |

|
2007-10-17 08:53:54 |
Hi all,
I'm trying to write a new plugin that will download pages
with contentType:
x-dosexec (EXE) files.
i've followed the "write your own plugin tutorial"
in the wiki and done the
following actions: (some actions are not mentioned in the
tutorial)
1. Created a new dir under
$NUTCH_HOME/src/plugins/parse-exe
2. Created new
$NUTCH_HOME/src/plugins/parse-exe/plugin.xml [displayed
below]
3. Created new
$NUTCH_HOME/src/plugins/parse-exe/build.xml [displayed
below]
4. Written the java code
$NUTCH_HOME/src/plugins/parse-exe/src/java/org/apache/nutch/parse/exe/ExeParser.java
5. Add "nutch-extensionpoints" &
"parse-exe" to the 'plugins-include'
property in $NUTCH_HOME/conf/nutch-site.xml
6. Add code to the $NUTCH_HOME/conf/parse-plugins.xml
[written below]
7. Added code to the $NUTCH_HOME/src/plugins/build.xml
[written below]
8. copied
$NUTCH_HOME/build/plugins/parse-exe/parse-exe.jar to
$NUTCH_HOME/plugins/parse-exe
9. run ant (build successful)
the log shows that nutch identifies the plugin:
2007-10-17 15:15:55,657 INFO plugin.PluginRepository -
Registered Plugins:
2007-10-17 15:15:55,657 INFO plugin.PluginRepository -
the nutch
core extension points (nutch-extensionpoints)
2007-10-17 15:15:55,657 INFO plugin.PluginRepository -
Html Parse
Plug-in (parse-html)
2007-10-17 15:15:55,657 INFO plugin.PluginRepository -
Exe Parse
Plug-in (parse-exe)
but when the fetcher encounters an x-dosexec file it throws
an exception:
2007-10-17 15:17:16,146 WARN parse.ParseUtil - No suitable parser found
when trying to parse content http://www.foo.com/yyy/foo.exe of type
application/x-dosexec
2007-10-17 15:17:16,146 WARN fetcher.Fetcher - Error parsing:
http://www.foo.com/yyy/movie30.exe
(sorry, but the url has been masked for security reasons)
Am i missing something??
thanks !!
[$NUTCH_HOME/src/plugins/build.xml]
<ant dir="parse-exe"
target="deploy"/>
[parse-plugins.xml]
<mimeType name="application/x-dosexec">
<plugin id="parse-exe" />
</mimeType>
[plugin.xml] // copied and changed from parse-pdf
<?xml version="1.0"
encoding="UTF-8"?>
<plugin
id="parse-exe"
name="Exe Parse Plug-in"
version="1.0.0"
provider-name="nutch.org">
<runtime>
<library name="parse-exe.jar">
<export name="*"/>
</library>
</runtime>
<requires>
<import
plugin="nutch-extensionpoints"/>
<import plugin="lib-log4j"/>
</requires>
<extension id="org.apache.nutch.parse.exe"
name="ExeParse"
point="org.apache.nutch.parse.Parser">
<implementation
id="org.apache.nutch.parse.exe.ExeParse"
class="org.apache.nutch.parse.exe.ExeParse">
<parameter name="contentType"
value="application/x-dosexec"/>
<parameter name="pathSuffix"
value=""/>
</implementation>
</extension>
</plugin>
------------------------------------------------------------
-----------------------------------------------------
[build.xml]
<?xml version="1.0"?>
<project name="parse-exe"
default="jar-core">
<import file="../build-plugin.xml"/>
</project>
------------------------------------------------------------
------------
[ExeParser.java]
public class ExeParser implements Parser {
public static final Log LOG = LogFactory.getLog("
org.apache.nutch.parse.exe");
private Configuration conf;
public Parse getParse(Content content) {
try {
byte[] raw = content.getContent();
// enter here my code ( i will replace this with real
code)
LOG.info ("EDRI:: you have reached the parse-exe
plugin!");
System.out.println("EDRI:: system.out.print...
parse-exe");
String contentLength = content.getMetadata().get(
Response.CONTENT_LENGTH);
if (contentLength != null && raw.length !=
Integer.parseInt(contentLength))
{
return new ParseStatus(ParseStatus.FAILED,
ParseStatus.FAILED_TRUNCATED,
"Content truncated at
"+raw.length
+" bytes. Parser can't handle incomplete
exe
file.").getEmptyParse(getConf());
}
} catch (Exception e) { // run time exception
if (LOG.isWarnEnabled()) {
LOG.warn("General exception in EXE parser:
"+e.getMessage());
e.printStackTrace(LogUtil.getWarnStream(LOG));
}
return new ParseStatus(ParseStatus.FAILED,
"Can't be handled as exe document. "
+
e).getEmptyParse(getConf());
}
/// i'm not sure what to return here if i only need to
d/l the file
ParseData parseData = new
ParseData(ParseStatus.STATUS_SUCCESS, "",null,
null, null);
parseData.setConf(this.conf);
return new ParseImpl("", parseData);
}
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return this.conf;
}
--
Eyal Edri
|
|
| Re: writing a new parse-exe plugin |
  United States |
2007-10-17 15:50:47 |
Hi Eyal,
Did you also modify parse-plugins.xml at the bottom to add
an alias for
parse-exe to point to the actual extension point id? I'm
guessing that's
your problem. Check out the bottom of parse-plugins.xml for
an example of
this.
Let me know if you still need more help and we'll go from
there.
Thanks,
Chris
On 10/17/07 6:53 AM, "eyal edri" <eyal.edri gmail.com> wrote:
> Hi all,
>
> I'm trying to write a new plugin that will download
pages with contentType:
> x-dosexec (EXE) files.
> i've followed the "write your own plugin
tutorial" in the wiki and done the
> following actions: (some actions are not mentioned in
the tutorial)
>
> 1. Created a new dir under
$NUTCH_HOME/src/plugins/parse-exe
> 2. Created new
$NUTCH_HOME/src/plugins/parse-exe/plugin.xml [displayed
> below]
> 3. Created new
$NUTCH_HOME/src/plugins/parse-exe/build.xml [displayed
> below]
> 4. Written the java code
>
>
$NUTCH_HOME/src/plugin/parse-exe/src/java/org/apache/nutch/p
arse/exe/ExeParser
> .java
> 5. Add "nutch-extensionpoints" &
"parse-exe" to the 'plugins-include'
> property in $NUTCH_HOME/conf/nutch-site.xml
> 6. Add code to the
$NUTCH_HOME/conf/parse-plugins.xml [written below]
> 7. Added code the the
$NUTCH_HOME/src/plugins/build.xml [written
> below]
> 8. copied
$NUTCH_HOME/build/plugins/parse-exe/parse-exe.jar to
> $NUTCH_HOME/plugins/parse-exe
> 9. run ant (build successful)
>
> the log shows that nutch identifies the plugin:
>
> 2007-10-17 15:15:55,657 INFO plugin.PluginRepository -
Registered Plugins:
> 2007-10-17 15:15:55,657 INFO plugin.PluginRepository -
the nutch
> core extension points (nutch-extensionpoints)
> 2007-10-17 15:15:55,657 INFO plugin.PluginRepository -
Html Parse
> Plug-in (parse-html)
> 2007-10-17 15:15:55,657 INFO plugin.PluginRepository -
Exe Parse
> Plug-in (parse-exe)
>
> but when the fetcher encounters a x-dosexec file it
thorws an exception:
>
> 2007-10-17 15:17:16,146 WARN parse.ParseUtil - No
suitable parser found
> when trying to parse content http://www.foo.com/yyy
/foo.exe of type
> application/x-dosexec
> 2007-10-17 15:17:16,146 WARN fetcher.Fetcher - Error
parsing:
> http://www.foo.com
/yyy/movie30.exe
>
> (sorry, but the url has been masked for security
reasons)
>
> Am i missing something??
>
> thanks !!
>
>
>
> [$NUTCH_HOME/src/plugins/build.xml]
>
> <ant dir="parse-exe"
target="deploy"/>
>
> [parse-plugins.xml]
>
> <mimeType
name="application/x-dosexec">
> <plugin id="parse-exe"
/>
> </mimeType>
>
>
> [plugin.xml] // copied and changed from parse-pdf
>
> <?xml version="1.0"
encoding="UTF-8"?>
> <plugin
> id="parse-exe"
> name="Exe Parse Plug-in"
> version="1.0.0"
> provider-name="nutch.org">
>
> <runtime>
> <library name="parse-exe.jar">
> <export name="*"/>
> </library>
> </runtime>
>
> <requires>
> <import
plugin="nutch-extensionpoints"/>
> <import plugin="lib-log4j"/>
> </requires>
>
> <extension
id="org.apache.nutch.parse.exe"
> name="ExeParse"
>
point="org.apache.nutch.parse.Parser">
>
> <implementation
id="org.apache.nutch.parse.exe.ExeParse"
>
class="org.apache.nutch.parse.exe.ExeParse">
> <parameter name="contentType"
value="application/x-dosexec"/>
> <parameter name="pathSuffix"
value=""/>
> </implementation>
> </extension>
>
> </plugin>
>
>
------------------------------------------------------------
------------------
> -----------------------------------
>
> [build.xml]
>
> <?xml version="1.0"?>
>
> <project name="parse-exe"
default="jar-core">
>
> <import file="../build-plugin.xml"/>
>
> </project>
>
>
------------------------------------------------------------
------------
> [ExeParser.java]
>
> public class ExeParser implements Parser {
> public static final Log LOG =
LogFactory.getLog("
> org.apache.nutch.parse.exe");
> private Configuration conf;
>
> public Parse getParse(Content content) {
>
> try {
>
> byte[] raw = content.getContent();
>
> // enter here my code ( i will replace this with
real code)
> LOG.info ("EDRI:: you have reached the
parse-exe plugin!");
> System.out.println("EDRI::
system.out.print... parse-exe");
>
>
>
>
> String contentLength =
content.getMetadata().get(
> Response.CONTENT_LENGTH);
> if (contentLength != null && raw.length
!=
> Integer.parseInt(contentLength))
> {
> return new ParseStatus(ParseStatus.FAILED,
> ParseStatus.FAILED_TRUNCATED,
> "Content truncated at
"+raw.length
> +" bytes. Parser can't handle
incomplete exe
> file.").getEmptyParse(getConf());
> }
>
> } catch (Exception e) { // run time exception
> if (LOG.isWarnEnabled()) {
> LOG.warn("General exception in EXE
parser: "+e.getMessage());
>
e.printStackTrace(LogUtil.getWarnStream(LOG));
> }
> return new ParseStatus(ParseStatus.FAILED,
> "Can't be handled as exe document.
" +
> e).getEmptyParse(getConf());
> }
>
> /// i'm not sure what to return here if i only need
to d/l the file
>
> ParseData parseData = new
ParseData(ParseStatus.STATUS_SUCCESS, "",null,
> null, null);
> parseData.setConf(this.conf);
> return new ParseImpl("", parseData);
> }
>
> public void setConf(Configuration conf) {
> this.conf = conf;
> }
>
> public Configuration getConf() {
> return this.conf;
> }
>
>
>
>
______________________________________________
Chris Mattmann, Ph.D.
Chris.Mattmann jpl.nasa.gov
Cognizant Development Engineer
Early Detection Research Network Project
_________________________________________________
Jet Propulsion Laboratory Pasadena, CA
Office: 171-266B Mailstop: 171-246
_______________________________________________________
Disclaimer: The opinions presented within are my own and do
not reflect
those of either NASA, JPL, or the California Institute of
Technology.
|
|
| Re: writing a new parse-exe plugin |

|
2007-10-18 03:23:21 |
Excellent !!
that did the trick!
Any chance to create a new page on the plugin central for
writing a
nutch-0.9 plugin, stating the checklist (written below)?
(I would have uploaded it, but I don't have the rights to open a
new page)
The checklist: (relevant for a parse plugin, implementing
the Parser
extension point)
1. Create new dir under
$NUTCH_HOME/src/plugins/parse-XXX
2. Create new
$NUTCH_HOME/src/plugins/parse-XXX/plugin.xml [displayed
below]
3. Create new $NUTCH_HOME/src/plugins/parse-XXX/build.xml
[displayed
below]
4. Write the java code
$NUTCH_HOME/src/plugins/parse-XXX/src/java/org/apache/nutch/parse/XXX/XXXParser.java
5. Add "nutch-extensionpoints" &
"parse-XXX" to the 'plugins-include'
property in $NUTCH_HOME/conf/nutch-site.xml
6. Add code to the $NUTCH_HOME/conf/parse-plugins.xml
[written below]
(new mime type & alias)
7. Add code to the $NUTCH_HOME/src/plugins/build.xml
[written below]
8. copied
$NUTCH_HOME/build/plugins/parse-XXX/parse-XXX.jar to
$NUTCH_HOME/plugins/parse-XXX
9. run ant (build successful)
I've got a few more questions just to tie up the loose
ends:
1. The exe extension has a few content types related to it
(e.g.
application/(x-exe|x-msdos|x-msdownload|octet-stream));
how can I configure parse-exe to capture all of them?
2. I've noticed that after every build I need to copy
build/parse-exe/parse-exe.jar to plugins/parse-exe; is there any way to tell
the build to put it directly
in plugins/parse-exe?
3. I get a NullPointerException from the fetcher after parse-exe
works; can
you guide me on what I should return from parse-exe?
The parse-exe plugin (the getParse function):
public class ExeParser implements Parser {
public static final Log LOG =
LogFactory.getLog(ExeParser.class);
private Configuration conf;
public static final String DOWNLOAD_DIR =
"/home/eyale/HTTPSEC/nutch-0.9
/DOWNLOADS/";
public ExeParser() {
LOG.info ("EDRI:: created exe-parser
object");
}
public Parse getParse(Content content) {
String resultText = null;
String resultTitle = null;
Outlink[] outlinks = null;
try {
byte[] raw = content.getContent();
// enter here my code
String contentLength = content.getMetadata().get(
Response.CONTENT_LENGTH);
if (contentLength != null && raw.length !=
Integer.parseInt(contentLength))
{
return new ParseStatus(ParseStatus.FAILED,
ParseStatus.FAILED_TRUNCATED,
"Content truncated at
"+raw.length
+" bytes. Parser can't handle incomplete
exe
file.").getEmptyParse(getConf());
}
// download the file
downloadContentType(content);
}catch (Exception e) { // run time exception
if (LOG.isWarnEnabled()) {
LOG.warn("General exception in EXE parser:
"+e.getMessage());
e.printStackTrace(LogUtil.getWarnStream(LOG));
}
return new ParseStatus(ParseStatus.FAILED,
"Can't be handled as exe document. "
+
e).getEmptyParse(getConf());
}
final ParseData parseData = new
ParseData(ParseStatus.STATUS_SUCCESS,
resultTitle,
outlinks,
content.getMetadata());
return new ParseImpl(resultText, parseData);
}
Thanks!!!
On 10/17/07, eyal edri <eyal.edri gmail.com> wrote:
>
> Hi all,
>
> I'm trying to write a new plugin that will download
pages with
> contentType: x-dosexec (EXE) files.
> i've followed the "write your own plugin
tutorial" in the wiki and done
> the following actions: (some actions are not mentioned
in the tutorial)
>
> 1. Created a new dir under
$NUTCH_HOME/src/plugins/parse-exe
> 2. Created new
$NUTCH_HOME/src/plugins/parse-exe/plugin.xml
> [displayed below]
> 3. Created new
$NUTCH_HOME/src/plugins/parse-exe/build.xml
> [displayed below]
> 4. Written the java code
>
$NUTCH_HOME/src/plugin/parse-exe/src/java/org/apache/nutch/p
arse/exe/ExeParser.java
> 5. Add "nutch-extensionpoints" &
"parse-exe" to the
> 'plugins-include' property in
$NUTCH_HOME/conf/nutch- site.xml
> 6. Add code to the
$NUTCH_HOME/conf/parse-plugins.xml [written
> below]
> 7. Added code the the
$NUTCH_HOME/src/plugins/build.xml [written
> below]
> 8. copied $NUTCH_HOME/build/plugins/parse-exe/parse-
exe.jar to
> $NUTCH_HOME/plugins/parse-exe
> 9. run ant (build successful)
>
> the log shows that nutch identifies the plugin:
>
> 2007-10-17 15:15:55,657 INFO plugin.PluginRepository -
Registered
> Plugins:
> 2007-10-17 15:15:55,657 INFO plugin.PluginRepository -
the nutch
> core extension points (nutch-extensionpoints)
> 2007-10-17 15:15:55,657 INFO plugin.PluginRepository -
Html Parse
> Plug-in (parse-html)
> 2007-10-17 15:15:55,657 INFO plugin.PluginRepository -
Exe Parse
> Plug-in (parse-exe)
>
> but when the fetcher encounters a x-dosexec file it
thorws an exception:
>
> 2007-10-17 15:17:16,146 WARN parse.ParseUtil - No
suitable parser found
> when trying to parse content http://www.foo.com/yyy
/foo.exe of type
> application/x-dosexec
> 2007-10-17 15:17:16,146 WARN fetcher.Fetcher - Error
parsing:
> http://www.foo.com
/yyy/movie30.exe
>
> (sorry, but the url has been masked for security
reasons)
>
> Am i missing something??
>
> thanks !!
>
>
>
> [$NUTCH_HOME/src/plugins/build.xml]
>
> <ant dir="parse-exe"
target="deploy"/>
>
> [parse-plugins.xml]
>
> <mimeType
name="application/x-dosexec">
> <plugin id="parse-exe"
/>
> </mimeType>
>
>
> [plugin.xml] // copied and changed from parse-pdf
>
> <?xml version="1.0"
encoding="UTF-8"?>
> <plugin
> id="parse-exe"
> name="Exe Parse Plug-in"
> version="1.0.0"
> provider-name="nutch.org">
>
> <runtime>
> <library name="parse-exe.jar">
> <export name="*"/>
> </library>
> </runtime>
>
> <requires>
> <import
plugin="nutch-extensionpoints"/>
> <import plugin="lib-log4j"/>
> </requires>
>
> <extension
id="org.apache.nutch.parse.exe"
> name="ExeParse"
>
point="org.apache.nutch.parse.Parser">
>
> <implementation
id="org.apache.nutch.parse.exe.ExeParse"
> class="
org.apache.nutch.parse.exe.ExeParse">
> <parameter name="contentType"
value="application/x-dosexec"/>
> <parameter name="pathSuffix"
value=""/>
> </implementation>
> </extension>
>
> </plugin>
>
>
>
------------------------------------------------------------
-----------------------------------------------------
>
> [build.xml]
>
> <?xml version=" 1.0"?>
>
> <project name="parse-exe"
default="jar-core">
>
> <import file="../build-plugin.xml"/>
>
> </project>
>
>
------------------------------------------------------------
------------
> [ExeParser.java]
>
> public class ExeParser implements Parser {
> public static final Log LOG =
LogFactory.getLog("
> org.apache.nutch.parse.exe");
> private Configuration conf;
>
> public Parse getParse(Content content) {
>
> try {
>
> byte[] raw = content.getContent();
>
> // enter here my code ( i will replace this with
real code)
> LOG.info ("EDRI:: you have reached the
parse-exe plugin!");
> System.out.println("EDRI::
system.out.print... parse-exe");
>
>
>
>
> String contentLength =
content.getMetadata().get(
> Response.CONTENT_LENGTH );
> if (contentLength != null && raw.length
!= Integer.parseInt(contentLength))
> {
> return new ParseStatus(ParseStatus.FAILED,
> ParseStatus.FAILED_TRUNCATED,
> "Content truncated at
"+raw.length
> +" bytes. Parser can't handle
incomplete exe
> file.").getEmptyParse(getConf());
> }
>
> } catch (Exception e) { // run time exception
> if (LOG.isWarnEnabled()) {
> LOG.warn("General exception in EXE
parser: "+e.getMessage());
>
e.printStackTrace(LogUtil.getWarnStream(LOG));
> }
> return new ParseStatus(ParseStatus.FAILED,
> "Can't be handled as exe document.
" +
> e).getEmptyParse(getConf());
> }
>
> /// i'm not sure what to return here if i only need
to d/l the file
>
> ParseData parseData = new
ParseData(ParseStatus.STATUS_SUCCESS,
> "",null, null, null);
> parseData.setConf(this.conf);
> return new ParseImpl("", parseData);
> }
>
> public void setConf(Configuration conf) {
> this.conf = conf;
> }
>
> public Configuration getConf() {
> return this.conf;
> }
>
>
>
>
>
> --
> Eyal Edri
--
Eyal Edri
|
|
| Re: writing a new parse-exe plugin |

|
2007-10-18 04:11:59 |
Found how to associate multiple contentTypes with a certain
plugin:
just add a mimeType entry for each content type to the
conf/parse-plugins.xml file (the plugin
can take more than one type), e.g.:
<mimeType name="application/x-dosexec">
<plugin id="parse-exe" />
</mimeType>
On 10/18/07, eyal edri <eyal.edri gmail.com> wrote:
>
> Excellent !!
>
> that did the trick!
>
> Any chance to create a new page on the plugin central
for writing a
> nutch-0.9 plugin, stating the checklist (written
below)?
> (i would have uploaded, but dont have the rights to
open a new page)
>
> The checklist: (relevant for a parse plugin,
implementing the Parse
> extention point)
>
> 1. Create new dir under
$NUTCH_HOME/src/plugins/parse-XXX
> 2. Create new
$NUTCH_HOME/src/plugins/parse-XXX/plugin.xml
> [displayed below]
> 3. Create new
$NUTCH_HOME/src/plugins/parse-XXX/build.xml [displayed
> below]
> 4. Write the java code
>
$NUTCH_HOME/src/plugin/parse-XXX/src/java/org/apache/nutch/p
arse/XXX/XXXParser.java
> 5. Add "nutch-extensionpoints" &
"parse-XXX" to the
> 'plugins-include' property in
$NUTCH_HOME/conf/nutch- site.xml
> 6. Add code to the
$NUTCH_HOME/conf/parse-plugins.xml [written
> below] (new mime type & alias)
> 7. Added code the the
$NUTCH_HOME/src/plugins/build.xml [written
> below]
> 8. copied $NUTCH_HOME/build/plugins/parse-XXX/parse-
XXX.jar to
> $NUTCH_HOME/plugins/parse-XXX
> 9. run ant (build successful)
>
> I've got a few of more questions just to tie the loose
ends..:
>
> 1. Exe extension has a few content types related to it
(e.g.
> application(x-exe|x-msdos|x-msdownload|octet-strem))
> how can i config parse-exe to capture all of them?
(solved)
> 2. i've noticed that after every build i need to copy
> build/parse-exe/parse-exe.jar to plugins/parse-exe, any
way to tell him to
> build it directly
> to plugins/parse-exe?
> 3. i get a nullPointerException from fetcher after the
parse-exe works,
> can you guide me on what i should return from the
parse-exe?
>
> the parse-exe plugin: ( the getParse funtion)
>
> public class ExeParser implements Parser {
> public static final Log LOG =
LogFactory.getLog(ExeParser.class );
> private Configuration conf;
> public static final String DOWNLOAD_DIR =
"/home/eyale/HTTPSEC/nutch-0.9
> /DOWNLOADS/";
>
> public ExeParser() {
> LOG.info ("EDRI:: created exe-parser
object");
> }
>
> public Parse getParse(Content content) {
> String resultText = null;
> String resultTitle = null;
> Outlink[] outlinks = null;
>
> try {
>
> byte[] raw = content.getContent();
>
> // enter here my code
>
> String contentLength =
content.getMetadata().get(
> Response.CONTENT_LENGTH);
> if (contentLength != null && raw.length
!= Integer.parseInt(contentLength))
> {
> return new ParseStatus(ParseStatus.FAILED ,
> ParseStatus.FAILED_TRUNCATED,
> "Content truncated at
"+raw.length
> +" bytes. Parser can't handle
incomplete exe
> file.").getEmptyParse(getConf());
> }
> // download the file
> downloadContentType(content);
>
> }catch (Exception e) { // run time exception
> if (LOG.isWarnEnabled()) {
> LOG.warn("General exception in EXE
parser: "+e.getMessage());
> e.printStackTrace
(LogUtil.getWarnStream(LOG));
> }
> return new ParseStatus(ParseStatus.FAILED,
> "Can't be handled as exe document.
" +
> e).getEmptyParse(getConf());
> }
>
> final ParseData parseData = new ParseData(
ParseStatus.STATUS_SUCCESS,
>
resultTitle, outlinks,
>
content.getMetadata());
> return new ParseImpl(resultText, parseData);
> }
>
> Thanks!!!
>
>
>
>
>
>
> On 10/17/07, eyal edri < eyal.edri gmail.com> wrote:
> >
> > Hi all,
> >
> > I'm trying to write a new plugin that will
download pages with
> > contentType: x-dosexec (EXE) files.
> > i've followed the "write your own plugin
tutorial" in the wiki and done
> > the following actions: (some actions are not
mentioned in the tutorial)
> >
> > 1. Created a new dir under
$NUTCH_HOME/src/plugins/parse-exe
> > 2. Created new
$NUTCH_HOME/src/plugins/parse-exe/plugin.xml
> > [displayed below]
> > 3. Created new
$NUTCH_HOME/src/plugins/parse-exe/build.xml
> > [displayed below]
> > 4. Written the java code
> >
$NUTCH_HOME/src/plugin/parse-exe/src/java/org/apache/nutch/p
arse/exe/ExeParser.java
> > 5. Add "nutch-extensionpoints" &
"parse-exe" to the
> > 'plugins-include' property in
$NUTCH_HOME/conf/nutch- site.xml
> > 6. Add code to the
$NUTCH_HOME/conf/parse-plugins.xml [written
> > below]
> > 7. Added code the the
$NUTCH_HOME/src/plugins/build.xml [written
> > below]
> > 8. copied
$NUTCH_HOME/build/plugins/parse-exe/parse- exe.jar to
> > $NUTCH_HOME/plugins/parse-exe
> > 9. run ant (build successful)
> >
> > the log shows that nutch identifies the plugin:
> >
> > 2007-10-17 15:15:55,657 INFO
plugin.PluginRepository - Registered
> > Plugins:
> > 2007-10-17 15:15:55,657 INFO
plugin.PluginRepository - the
> > nutch core extension points
(nutch-extensionpoints)
> > 2007-10-17 15:15:55,657 INFO
plugin.PluginRepository - Html
> > Parse Plug-in (parse-html)
> > 2007-10-17 15:15:55,657 INFO
plugin.PluginRepository - Exe
> > Parse Plug-in (parse-exe)
> >
> > but when the fetcher encounters a x-dosexec file
it thorws an exception:
> >
> >
> > 2007-10-17 15:17:16,146 WARN parse.ParseUtil - No
suitable parser found
> > when trying to parse content http://www.foo.com/yyy
/foo.exe of type
> > application/x-dosexec
> > 2007-10-17 15:17:16,146 WARN fetcher.Fetcher -
Error parsing:
> > http://www.foo.com
/yyy/movie30.exe
> >
> > (sorry, but the url has been masked for security
reasons)
> >
> > Am i missing something??
> >
> > thanks !!
> >
> >
> >
> > [$NUTCH_HOME/src/plugins/build.xml]
> >
> > <ant dir="parse-exe"
target="deploy"/>
> >
> > [parse-plugins.xml]
> >
> > <mimeType
name="application/x-dosexec">
> > <plugin
id="parse-exe" />
> > </mimeType>
> >
> >
> > [plugin.xml] // copied and changed from parse-pdf
> >
> > <?xml version="1.0"
encoding="UTF-8"?>
> > <plugin
> > id="parse-exe"
> > name="Exe Parse Plug-in"
> > version="1.0.0"
> > provider-name="nutch.org">
> >
> > <runtime>
> > <library
name="parse-exe.jar">
> > <export name="*"/>
> > </library>
> > </runtime>
> >
> > <requires>
> > <import
plugin="nutch-extensionpoints"/>
> > <import
plugin="lib-log4j"/>
> > </requires>
> >
> > <extension
id="org.apache.nutch.parse.exe"
> > name="ExeParse"
> >
point="org.apache.nutch.parse.Parser">
> >
> > <implementation
id="org.apache.nutch.parse.exe.ExeParse"
> > class="
org.apache.nutch.parse.exe.ExeParse">
> > <parameter name="contentType"
value="application/x-dosexec"/>
> > <parameter name="pathSuffix"
value=""/>
> > </implementation>
> > </extension>
> >
> > </plugin>
> >
> >
> >
------------------------------------------------------------
-----------------------------------------------------
> >
> > [build.xml]
> >
> > <?xml version=" 1.0"?>
> >
> > <project name="parse-exe"
default="jar-core">
> >
> > <import
file="../build-plugin.xml"/>
> >
> > </project>
> >
> >
------------------------------------------------------------
------------
> > [ExeParser.java]
> >
> > public class ExeParser implements Parser {
> > public static final Log LOG =
LogFactory.getLog("
> > org.apache.nutch.parse.exe");
> > private Configuration conf;
> >
> > public Parse getParse(Content content) {
> >
> > try {
> >
> > byte[] raw = content.getContent();
> >
> > // enter here my code ( i will replace this
with real code)
> > LOG.info ("EDRI:: you have reached the
parse-exe plugin!");
> > System.out.println("EDRI::
system.out.print... parse-exe");
> >
> >
> >
> >
> > String contentLength =
content.getMetadata().get(
> > Response.CONTENT_LENGTH );
> > if (contentLength != null &&
raw.length != Integer.parseInt(contentLength))
> > {
> > return new
ParseStatus(ParseStatus.FAILED,
> > ParseStatus.FAILED_TRUNCATED,
> > "Content truncated at
"+raw.length
> > +" bytes. Parser can't handle
incomplete exe
> > file.").getEmptyParse(getConf());
> > }
> >
> > } catch (Exception e) { // run time exception
> > if (LOG.isWarnEnabled()) {
> > LOG.warn("General exception in EXE
parser: "+e.getMessage());
> >
e.printStackTrace(LogUtil.getWarnStream(LOG));
> > }
> > return new
ParseStatus(ParseStatus.FAILED,
> > "Can't be handled as exe
document. " +
> > e).getEmptyParse(getConf());
> > }
> >
> > /// i'm not sure what to return here if i only
need to d/l the file
> >
> > ParseData parseData = new
ParseData(ParseStatus.STATUS_SUCCESS,
> > "",null, null, null);
> > parseData.setConf(this.conf);
> > return new ParseImpl("",
parseData);
> > }
> >
> > public void setConf(Configuration conf) {
> > this.conf = conf;
> > }
> >
> > public Configuration getConf() {
> > return this.conf;
> > }
> >
> >
> >
> >
> >
> > --
> > Eyal Edri
>
>
>
>
> --
> Eyal Edri
--
Eyal Edri
|
|
[1-4]
|
|