`

Nutch package 下的build.xml解读

阅读更多
花了很多时间来读这个build.xml,并不是这个包不好读,相反可读性非常强,这就是xml的好处.自己花了很多的时间

去熟悉ant!从Ant的task,type,nested element等一点一点读起.这样整个nutch的配置结构,我就了解的更加清楚了.

接下来的任务就是继续熟悉一下JavaCC这个软件,再读一下它的API文档,再往后的工作就是修改nutch了.

将build.xml的内容粘于此处.

<?xml version='1.0' encoding='GBK'?>
   <!--如果出现汉字,请使用GBK编码-->

<project name="Nutch" default="compile">

<!--修改意见-->
<!--可以写一个deploy模块,把war文件拷贝到tomcat/webapps下面去,这部分文中没有涉及-->
  <!-- Load all the default properties, and any the user wants    -->
  <!-- to contribute (without having to type -D or edit this file). -->
  <!-- Article author's note: no build.properties file was found in the distribution. -->
  <!-- <property> is an Ant task. Ant properties are set-once, so the first file
       that defines a key wins: user-level file, then project-level overrides,
       then the shipped defaults. Do not reorder these three lines. -->
  <property file="${user.home}/build.properties" />
  <property file="${basedir}/build.properties" />
  <property file="${basedir}/default.properties" />

  <!-- the normal classpath -->
   <!--build.classes下含有lib.dir下的所有的jar文件-->
  <!--path-like structure,定义classpath-->
  <path id="classpath">
    <pathelement location="${build.classes}"/>
    <!--fileset为type-->
    <fileset dir="${lib.dir}">
      <include name="*.jar" />
    </fileset>
  </path>

  <!-- the unit test classpath -->
  <!-- <dirname> is an Ant task: it stores the directory that contains
       ${build.plugins} in the property plugins.classpath.dir -->
  <dirname property="plugins.classpath.dir" file="${build.plugins}"/>
  <path id="test.classpath">
    <pathelement location="${test.build.classes}" />
    <pathelement location="${conf.dir}"/>
    <pathelement location="${test.src.dir}"/>
    <pathelement location="${plugins.classpath.dir}"/>
    <!-- append the "classpath" path defined earlier in this file -->
    <path refid="classpath"/>
  </path>

  <!-- xmlcatalog definition for xslt task -->
  <!--使用xmlcatalog type定义docDTDs-->
  <xmlcatalog id="docDTDs">
     <dtd publicId="-//W3C//DTD XHTML 1.0 Transitional//EN"           
          location="${xmlcatalog.dir}/xhtml1-transitional.dtd"/>
  </xmlcatalog>

  <!-- ====================================================== -->
  <!-- Stuff needed by all targets                            -->
  <!-- ====================================================== -->
  <target name="init">
    <!--mkdir为task,创建用于build和test的目录-->
    <mkdir dir="${build.dir}"/>
    <mkdir dir="${build.classes}"/>

    <mkdir dir="${test.build.dir}"/>
    <mkdir dir="${test.build.classes}"/>
   
    <!--将conf.dir中对应格式的文件修改时间设为datetime-->
    <touch datetime="01/25/1971 2:00 pm">
      <fileset dir="${conf.dir}" includes="**/*.template"/>
    </touch>

    <copy todir="${conf.dir}" verbose="true">
      <fileset dir="${conf.dir}" includes="**/*.template"/>
      <!--mapper为type,将*.template转为*-->
      <mapper type="glob" from="*.template" to="*"/>
    </copy>


  </target>

  <!-- ====================================================== -->
  <!-- Compile the Java files                                 -->
  <!-- ====================================================== -->
  <!-- Default target of this project: chains compile-core, compile-plugins and jar -->
  <target name="compile" depends="compile-core, compile-plugins, jar"/>

  <target name="compile-core" depends="init">
    <!--将生成的class文件存入build/classes-->
    <javac
     encoding="${build.encoding}"
     srcdir="${src.dir}"
     includes="org/apache/nutch/**/*.java"
     destdir="${build.classes}"
     debug="${debug}"
     optimize="${optimize}"
     deprecation="${deprecation}">
      <!--使用已经定义的classpath-->
      <classpath refid="classpath"/>
    </javac>   
  </target>

  <target name="compile-plugins">
    <!-- Run the "deploy" target of the build file under src/plugin;
         inheritAll="false" means this project's properties are not passed down -->
    <ant dir="src/plugin" target="deploy" inheritAll="false"/>
  </target>

  <target name="generate-src" depends="init">
    <!--使用javacc命令-->
    <javacc target="${src.dir}/org/apache/nutch/quality/dynamic/PageDescription.jj"
            javacchome="${javacc.home}">
    </javacc>
    <!--对NutchAnalysis.jj运行javacc,我们修改分词需要修改此接口-->
    <javacc target="${src.dir}/org/apache/nutch/analysis/NutchAnalysis.jj"
            javacchome="${javacc.home}">
    </javacc>
    <!--CR即为carriage(回车),LF即为Line Feed(换行)即将src.dir下的匹配文件eof以lf代替-->
    <fixcrlf srcdir="${src.dir}" eol="lf" includes="**/*.java"/>

  </target>

  <target name="dynamic" depends="generate-src, compile">
  </target>


  <!-- ================================================================== -->
  <!-- Make nutch.jar                                                     -->
  <!-- ================================================================== -->
  <!--                                                                    -->
  <!-- ================================================================== -->
  <target name="jar" depends="compile-core">
    <!--将conf目录下的nutch-default.xml和nutch-site.xml拷入build文件目下-->
    <copy file="${conf.dir}/nutch-default.xml"
          todir="${build.classes}"/>
    <copy file="${conf.dir}/nutch-site.xml"
          todir="${build.classes}"/>
    <!--将build/classes下的文件打包成jar-->
    <jar
      jarfile="${build.dir}/${final.name}.jar"
      basedir="${build.classes}"
    />
    <!--将生成的nutch-0.7.jar拷贝到根目录下命名为nutch.jar-->
    <copy file="${build.dir}/${final.name}.jar" tofile="${basedir}/${name}.jar" />
  </target>

  <!-- ================================================================== -->
  <!-- Make nutch.war                                                     -->
  <!-- ================================================================== -->
  <!--                                                                    -->
  <!-- ================================================================== -->
  <target name="war" depends="compile,generate-docs">
    <!--war是一个task,将里面包含的所有的东西打包成一个war文件,如果需要修改前台的界面包括
    nutch的logo等等,都要提前修改其中的一些图片,界面.最后打好包放到tomcat中-->
    <war destfile="${build.dir}/${final.name}.war"
  webxml="${web.src.dir}/web.xml">
      <fileset dir="${web.src.dir}/jsp"/>
      <zipfileset dir="${docs.src}" includes="include/*.html"/>
      <zipfileset dir="${build.docs}" includes="*/include/*.html"/>
      <fileset dir="${docs.dir}"/>
      <lib dir="${lib.dir}">
<include name="lucene*.jar"/>
<include name="taglibs-*.jar"/>
<include name="dom4j-*.jar"/>
<include name="xerces-*.jar"/>
      </lib>
      <lib dir="${build.dir}">
<include name="${final.name}.jar"/>
      </lib>
      <classes dir="${conf.dir}" excludes="**/*.template"/>
      <classes dir="${web.src.dir}/locale"/>
      <zipfileset prefix="WEB-INF/classes/plugins" dir="${build.plugins}"/>
      <webinf dir="${lib.dir}">
<include name="taglibs-*.tld"/>
      </webinf>
    </war>
   </target>


  <!-- ================================================================== -->
  <!-- Compile test code                                                  -->
  <!-- ================================================================== -->
  <target name="compile-core-test" depends="compile-core">
    <javac
     encoding="${build.encoding}"
     srcdir="${test.src.dir}"
     includes="org/apache/nutch/**/*.java"
     destdir="${test.build.classes}"
     debug="${debug}"
     deprecation="${deprecation}">
     <!--使用先前定义的test.classpath-->
      <classpath refid="test.classpath"/>
    </javac>   
  </target>

  <!-- ================================================================== -->
  <!-- Run unit tests                                                     -->
  <!-- ================================================================== -->
  <!-- Aggregate target: runs the core unit tests, then the plugin tests -->
  <target name="test" depends="test-core, test-plugins"/>

  <target name="test-core" depends="compile, compile-core-test">

    <delete dir="${test.build.data}"/>
    <mkdir dir="${test.build.data}"/>
    <!--使用nutch-site.xml-->
    <copy file="${test.src.dir}/nutch-site.xml"
          todir="${test.build.classes}"/>
    <!--junit为一个task,支持nested element,譬如sysproperty,formatter,batchtest-->
    <junit printsummary="yes" haltonfailure="no" fork="yes" dir="${basedir}"
      errorProperty="tests.failed" failureProperty="tests.failed">
      <sysproperty key="test.build.data" value="${test.build.data}"/>
      <sysproperty key="test.src.dir" value="${test.src.dir}"/>
      <classpath refid="test.classpath"/>
      <formatter type="plain" />
      <batchtest todir="${test.build.dir}" unless="testcase">
        <fileset dir="${test.src.dir}"
                 includes="**/Test*.java" excludes="**/${test.exclude}.java" />
      </batchtest>
      <batchtest todir="${test.build.dir}" if="testcase">
        <fileset dir="${test.src.dir}" includes="**/${testcase}.java"/>
      </batchtest>
    </junit>
    <!--fail为task-->
    <fail if="tests.failed">Tests failed!</fail>

  </target>  

  <target name="test-plugins" depends="compile">
    <!-- Run the "test" target of the build file under src/plugin without passing this project's properties down -->
    <ant dir="src/plugin" target="test" inheritAll="false"/>
  </target>

  <target name="nightly" depends="test, tar">
  </target>

  <!-- ================================================================== -->
  <!-- Documentation                                                      -->
  <!-- ================================================================== -->
  <target name="javadoc" depends="compile">
    <mkdir dir="${build.javadoc}"/>
    <!--javadoc为task,支持nested element,譬如packageset,link,group-->
    <javadoc
      overview="${src.dir}/overview.html"
      destdir="${build.javadoc}"
      author="true"
      version="true"
      use="true"
      windowtitle="${Name} ${version} API"
      doctitle="${Name} ${version} API"
      bottom="Copyright &amp;copy; ${year} The Apache Software Foundation"
      >
     <packageset dir="${src.dir}"/>
     <packageset dir="${plugins.dir}/protocol-file/src/java"/>
     <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
        <packageset dir="${plugins.dir}/protocol-http/src/java"/>
        <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
     <packageset dir="${plugins.dir}/parse-html/src/java"/>
     <packageset dir="${plugins.dir}/parse-js/src/java"/>
     <packageset dir="${plugins.dir}/parse-text/src/java"/>
     <packageset dir="${plugins.dir}/parse-pdf/src/java"/>
<!-- <packageset dir="${plugins.dir}/parse-rtf/src/java"/> plugin excluded from build due to licensing issues-->
<!-- <packageset dir="${plugins.dir}/parse-mp3/src/java"/> plugin excluded from build due to licensing issues-->
     <packageset dir="${plugins.dir}/parse-msword/src/java"/>
     <packageset dir="${plugins.dir}/index-basic/src/java"/>
     <packageset dir="${plugins.dir}/index-more/src/java"/>
     <packageset dir="${plugins.dir}/query-more/src/java"/>
     <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
     <packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/>
     <packageset dir="${plugins.dir}/creativecommons/src/java"/>
     <packageset dir="${plugins.dir}/languageidentifier/src/java"/>
     <packageset dir="${plugins.dir}/clustering-carrot2/src/java"/>
     <packageset dir="${plugins.dir}/ontology/src/java"/>
        <link href="${javadoc.link.java}"/>
        <link href="${javadoc.link.lucene}"/>
        <classpath refid="classpath"/>
     <classpath>
      <fileset dir="${plugins.dir}" >
       <include name="**/*.jar"/>
      </fileset>
     </classpath>
     <group title="Core" packages="org.apache.nutch.*"/>
     <group title="Plugins" packages="${plugins.packages}"/>
    </javadoc>
  </target>

  <target name="default-doc">
    <!--style即xslt命令,使用xsl文件解析xml-->
    <style basedir="${conf.dir}" destdir="${docs.dir}"
           includes="nutch-default.xml" style="conf/nutch-conf.xsl"/>
  </target>

  <target name="generate-locale" if="doc.locale">
    <echo message="Generating docs for locale=${doc.locale}"/>

    <mkdir dir="${build.docs}/${doc.locale}/include"/>
    <xslt in="${docs.src}/include/${doc.locale}/header.xml"
          out="${build.docs}/${doc.locale}/include/header.html"
          style="${docs.src}/style/nutch-header.xsl">
        <xmlcatalog refid="docDTDs"/>
    </xslt>

    <dependset>
       <srcfileset dir="${docs.src}/include/${doc.locale}" includes="*.xml"/>
       <srcfileset dir="${docs.src}/style" includes="*.xsl"/>
       <targetfileset dir="${docs.dir}/${doc.locale}" includes="*.html"/>
    </dependset> 

    <copy file="${docs.src}/style/nutch-page.xsl"
          todir="${build.docs}/${doc.locale}"
          preservelastmodified="true"/>

    <xslt basedir="${docs.src}/pages/${doc.locale}"
          destdir="${docs.dir}/${doc.locale}"
          includes="*.xml"
          style="${build.docs}/${doc.locale}/nutch-page.xsl">
         <xmlcatalog refid="docDTDs"/>
    </xslt>
  </target>


  <target name="generate-docs" depends="init">
    <dependset>
       <srcfileset dir="${docs.src}/include" includes="*.html"/>
       <targetfileset dir="${docs.dir}" includes="**/*.html"/>
    </dependset> 

    <mkdir dir="${build.docs}/include"/>
    <copy todir="${build.docs}/include">
      <fileset dir="${docs.src}/include"/>
    </copy>
     <!--antcall为task,调用generate-locale,并将相应的参数传递过去-->
    <antcall target="generate-locale">
      <param name="doc.locale" value="ca"/>
    </antcall>

    <antcall target="generate-locale">
      <param name="doc.locale" value="de"/>
    </antcall>

    <antcall target="generate-locale">
      <param name="doc.locale" value="en"/>
    </antcall>
   
    <antcall target="generate-locale">
      <param name="doc.locale" value="es"/>
    </antcall>
   
    <antcall target="generate-locale">
      <param name="doc.locale" value="fi"/>
    </antcall>
   
    <antcall target="generate-locale">
      <param name="doc.locale" value="fr"/>
    </antcall>
   
    <antcall target="generate-locale">
      <param name="doc.locale" value="hu"/>
    </antcall>

    <antcall target="generate-locale">
      <param name="doc.locale" value="jp"/>
    </antcall>

    <antcall target="generate-locale">
      <param name="doc.locale" value="ms"/>
    </antcall>

    <antcall target="generate-locale">
      <param name="doc.locale" value="nl"/>
    </antcall>

    <antcall target="generate-locale">
      <param name="doc.locale" value="pl"/>
    </antcall>

    <antcall target="generate-locale">
      <param name="doc.locale" value="pt"/>
    </antcall>

    <antcall target="generate-locale">
      <param name="doc.locale" value="sv"/>
    </antcall>

    <antcall target="generate-locale">
      <param name="doc.locale" value="th"/>
    </antcall>

    <antcall target="generate-locale">
      <param name="doc.locale" value="zh"/>
    </antcall>

    <fixcrlf srcdir="${docs.dir}" eol="lf" encoding="utf-8"
             includes="**/*.html"/>

  </target>

  <!-- ================================================================== -->
  <!-- D I S T R I B U T I O N                                            -->
  <!-- ================================================================== -->
  <!--                                                                    -->
  <!-- ================================================================== -->
  <!--将所有的东西,包括.jar,.war等文件都拷贝到dist目录中-->
  <target name="package" depends="jar, war, javadoc">
    <mkdir dir="${dist.dir}"/>
    <mkdir dir="${dist.dir}/lib"/>
    <mkdir dir="${dist.dir}/bin"/>
    <mkdir dir="${dist.dir}/docs"/>
    <mkdir dir="${dist.dir}/docs/api"/>
    <mkdir dir="${dist.dir}/plugins"/>

    <copy todir="${dist.dir}/lib" includeEmptyDirs="false">
      <fileset dir="lib"/>
    </copy>

    <copy todir="${dist.dir}/plugins">
      <fileset dir="${build.plugins}"/>
    </copy>

    <copy file="${build.dir}/${final.name}.jar" todir="${dist.dir}"/>

    <copy file="${build.dir}/${final.name}.war" todir="${dist.dir}"/>

    <copy todir="${dist.dir}/bin">
      <fileset dir="bin"/>
    </copy>

    <copy todir="${dist.dir}/conf">
      <fileset dir="${conf.dir}" excludes="**/*.template"/>
    </copy>

    <chmod perm="ugo+x" type="file">
        <fileset dir="${dist.dir}/bin"/>
    </chmod>

    <copy todir="${dist.dir}/docs">
      <fileset dir="${docs.dir}"/>
    </copy>

    <copy todir="${dist.dir}/docs/api">
      <fileset dir="${build.javadoc}"/>
    </copy>

    <copy todir="${dist.dir}">
      <fileset dir=".">
        <include name="*.txt" />
      </fileset>
    </copy>

    <copy todir="${dist.dir}/src" includeEmptyDirs="false">
      <fileset dir="src"/>
    </copy>

    <copy todir="${dist.dir}/" file="build.xml"/>
    <copy todir="${dist.dir}/" file="default.properties"/>

  </target>

  <!-- ================================================================== -->
  <!-- Make release tarball                                               -->
  <!-- ================================================================== -->
  <!--打包成压缩文件-->
  <target name="tar" depends="package">
    <tar compression="gzip" longfile="gnu"
      destfile="${build.dir}/${final.name}.tar.gz">
      <tarfileset dir="${build.dir}" mode="664">
<exclude name="${final.name}/bin/*" />
        <include name="${final.name}/**" />
      </tarfileset>
      <tarfileset dir="${build.dir}" mode="755">
        <include name="${final.name}/bin/*" />
      </tarfileset>
    </tar>
  </target>

  <!-- ================================================================== -->
  <!-- Clean.  Delete the build files, and their directories              -->
  <!-- ================================================================== -->
  <target name="clean">
    <!-- Remove the whole build output directory -->
    <delete dir="${build.dir}"/>
    <!-- Remove ${name}.jar, the copy of the jar that the "jar" target places in the project root -->
    <delete failonerror="true" file="${name}.jar" />
  </target>

</project>

分享到:
评论
1 楼 koubi1986 2013-08-20  
你好!请教一些问题:
请问一下
1。你是如何把nutch抓取到的二进制内容,在项目中读取的。
2。nutch抓取的时候,发现有很多url没有被成功抓取过来,请问你做的时候,有什么办法可以提高成功率?
3。对抓取过来内容你们是如何进行关键词搜索

希望群主可以帮忙解答一下。万分感激!
email:83132614@qq.com

相关推荐

Global site tag (gtag.js) - Google Analytics