Jump to content

Wikipedia:Most frequently edited pages/How to generate the lists

From Wikipedia, the free encyclopedia

This page explains how to generate the following lists.

Preconditions

[edit]

Namespaces.java

import java.util.HashMap;
import java.util.Map;

/**
 * Lookup table from namespace prefixes (the text in front of the first
 * {@code ':'} of a page title) to numeric namespace keys.
 */
class Namespaces {
	
	/** Namespace key reported for titles without a registered prefix. */
	public static final int MAIN_NAMESPACE = 0;
	
	/** Separates the namespace prefix from the rest of a page title. */
	private static final String NAMESPACE_SEPARATOR = ":";
	
	private final Map<String, Integer> map = new HashMap<String, Integer>();
	
	/**
	 * Registers a namespace prefix.
	 * @param key the prefix as it appears in front of the separator
	 * @param ns the numeric namespace key for that prefix
	 */
	public void add(String key, int ns) {
		map.put(key, ns);
	}
	
	/**
	 * Resolves the namespace key of a page title.
	 * @param text the full page title
	 * @return the key registered for the text before the first separator,
	 *         or {@link #MAIN_NAMESPACE} when the title has no separator or
	 *         its prefix is not registered
	 */
	public int ns(String text) {
		final int separatorIndex = text.indexOf(NAMESPACE_SEPARATOR);
		if (separatorIndex < 0) {
			return MAIN_NAMESPACE;
		}
		// substring() rather than split(): String.split drops trailing empty
		// strings, so a title consisting only of separators (":" or "::")
		// produced an empty array and [0] threw ArrayIndexOutOfBoundsException.
		final Integer ns = map.get(text.substring(0, separatorIndex));
		if (ns == null) {
			return MAIN_NAMESPACE;
		}
		return ns;
	}

}

Page.java

/**
 * A page of the dump together with two revision counters: one for the
 * reporting period and one overall.
 */
class Page {

	/** Full page title, fixed at construction. */
	private final String title;

	/** Numeric namespace key, fixed at construction. */
	private final int ns;

	/** Counter advanced by {@link #incrementEdits()}. */
	private int edits;

	/** Counter advanced by {@link #incrementTotalEdits()}. */
	private int totalEdits;

	/**
	 * Creates a page whose counters both start at zero.
	 * @param title the page title
	 * @param ns the numeric namespace key
	 */
	public Page(String title, int ns) {
		this.ns = ns;
		this.title = title;
	}

	/** @return the page title */
	public String getTitle() {
		return this.title;
	}

	/** @return the numeric namespace key */
	public int getNs() {
		return this.ns;
	}

	/** @return how often {@link #incrementEdits()} has been called */
	public int getEdits() {
		return this.edits;
	}

	/** @return how often {@link #incrementTotalEdits()} has been called */
	public int getTotalEdits() {
		return this.totalEdits;
	}

	/** Adds one to the period edit counter. */
	public void incrementEdits() {
		this.edits += 1;
	}

	/** Adds one to the overall edit counter. */
	public void incrementTotalEdits() {
		this.totalEdits += 1;
	}

}

PagesByNumberOfRecentEdits.java

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Comparator;
import java.util.Date;
import java.util.EmptyStackException;
import java.util.Stack;
import java.util.TimeZone;
import java.util.zip.GZIPInputStream;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

public abstract class PagesByNumberOfRecentEdits {

	// Pattern for the year-month label printed by DumpHandler.endDocument().
	private static final String YEARMONTH_FORMAT_STRING = "yyyy-MM";
	// Day-precision pattern; also the date part of the dump timestamp pattern.
	private static final String DATE_FORMAT_STRING = YEARMONTH_FORMAT_STRING + "-dd";
	// Shared formatter: parses the begin/end date properties and formats the
	// period in the subclasses' headers. No time zone is set on it in this file,
	// so it works in the JVM default zone.
	// NOTE(review): the headers label the period "(UTC)" — confirm the JVM runs in UTC.
	protected static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat(DATE_FORMAT_STRING);
	// Time-of-day pattern: used for the elapsed-time log line and as the time
	// part of the dump timestamp pattern.
	private static final String TIME_FORMAT_STRING = "HH:mm:ss";
	private static final Comparator<Page> PAGES_COMPARATOR = new Comparator<Page> {
		public int compare(Page page1, Page page2) {
			if (page1 == null || page2 == null) {
				if (page1 == null && page2 == null) {
					return 0;
				}
				if (page1 == null) {
					return 1;
				}
				if (page2 == null) {
					return -1;
				}
			}
			if (page1.getEdits() != page2.getEdits()) {
				return page2.getEdits() - page1.getEdits();
			} else {
				return page2.getTotalEdits() - page1.getTotalEdits(); 
			}
		}
	};
	
	private static final String LIMIT_PROPERTY_KEY = "limit";
	
	/** Creation time of this instance; execute() logs it and derives the elapsed time from it. */
	private final Date dateStarted = new Date();
	
	/** Highest rank to print; assigned in execute() from the "limit" system property. */
	private int limit;
	
	/** Report output; assigned in execute(). */
	private PrintWriter writer;
	
	/** Period boundaries; copied from the dump handler once parsing has finished. */
	private Date beginTimestamp;
	private Date endTimestamp;
	
	/** @return the report writer, or {@code null} before execute() has created it */
	protected PrintWriter getWriter() {
		return this.writer;
	}

	/** @return the start of the period, or {@code null} before parsing has finished */
	protected Date getBeginTimestamp() {
		return this.beginTimestamp;
	}

	/** @return the end of the period, or {@code null} before parsing has finished */
	protected Date getEndTimestamp() {
		return this.endTimestamp;
	}

	/**
	 * Runs the report: reads the gzip-compressed XML dump named by
	 * {@code args[0]} through a {@code DumpHandler}, then prints the table.
	 * Progress and timing go to standard error, the report to standard output
	 * as UTF-8. Prints the usage and exits with status 1 when no argument is
	 * given; also exits with status 1 on any handled error.
	 *
	 * @param args command-line arguments; the first one is the dump file path
	 */
	protected void execute(String[] args) {
		
		try {
			final int VALID_ARGUMENT_LENGTH = 1;
			if (args.length < VALID_ARGUMENT_LENGTH) {
				printUsage();
				System.exit(1);
			}
			writer = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"));
			System.err.println("Started. " + dateStarted);
			String limitText = System.getProperty(LIMIT_PROPERTY_KEY, "5000");
			limit = Integer.parseInt(limitText);
			final File dumpFile = new File(args[0]);
			fileNameCheck(dumpFile);
			final DumpHandler dumpHandler = new DumpHandler();
			dumpHandler.setLimit(limit);
			final FileInputStream dumpStream = new FileInputStream(dumpFile);
			try {
				SAXParserFactory.newInstance().newSAXParser().parse(
						new GZIPInputStream(dumpStream), dumpHandler);
			} finally {
				// Release the file handle whether or not parsing succeeded.
				dumpStream.close();
			}
			final Page[] pages = dumpHandler.getPages();
			beginTimestamp = dumpHandler.getBeginTimestamp();
			endTimestamp = dumpHandler.getEndTimestamp();
			print(pages);
		} catch (NumberFormatException e) {
			// Raised by the "limit" parse above, and possibly by the handler's
			// "min.edits" parse while the document starts, so name both.
			System.err.println("The specified system property \"" + LIMIT_PROPERTY_KEY + "\" or \""
					+ MINIMUM_EDITS_PROPERTY_KEY + "\" is not a valid integer.");
			System.err.println(e);
			System.exit(1);
		} catch (FileNotFoundException e) {
			System.err.println(e);
			System.exit(1);
		} catch (ParserConfigurationException e) {
			e.printStackTrace();
			System.exit(1);
		} catch (SAXException e) {
			e.printStackTrace();
			System.exit(1);
		} catch (IOException e) {
			e.printStackTrace();
			System.exit(1);
		} finally {
			final Date dateEnded = new Date();
			System.err.println("Ended. " + dateEnded);
			// Format the elapsed milliseconds as a UTC time of day.
			final SimpleDateFormat dateFormat = new SimpleDateFormat(TIME_FORMAT_STRING);
			dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
			System.err.println("Elapsed: " + dateFormat.format(new Date(dateEnded.getTime() - dateStarted.getTime())));
		}

	}
	
	private static final String BEGIN_DATE_PROPERTY_KEY = "begin.date";
	private static final String END_DATE_PROPERTY_KEY = "end.date";
	private static final String MINIMUM_EDITS_PROPERTY_KEY = "min.edits";
	
	/**
	 * Writes a one-line example invocation to standard error, using the
	 * concrete subclass name and its wiki name.
	 */
	private void printUsage() {
		final StringBuilder usage = new StringBuilder("Usage (example): java");
		usage.append(" -D").append(BEGIN_DATE_PROPERTY_KEY).append("=2008-04-01");
		usage.append(" -D").append(END_DATE_PROPERTY_KEY).append("=2008-04-30");
		usage.append(" -D").append(LIMIT_PROPERTY_KEY).append("=5000");
		usage.append(" -D").append(MINIMUM_EDITS_PROPERTY_KEY).append("=15");
		usage.append(' ').append(getClass().getName());
		usage.append(' ').append(getWikiName()).append("-20080501-stub-meta-history.xml.gz");
		usage.append(" > result.txt");
		System.err.println(usage.toString());
	}
	
	/**
	 * Sorts the collected pages and writes them as a wikitable: header first,
	 * then one row per page with rank, title link, namespace, period edits and
	 * total edits. Pages with equal period edits share a rank, and output stops
	 * once the rank exceeds the limit. The writer is flushed in every case.
	 *
	 * @param pages the collected pages; unused slots may be {@code null}
	 */
	private void print(Page[] pages) {
		try {
			printHeader();
			Arrays.sort(pages, PAGES_COMPARATOR);
			writer.print("{| class=\"wikitable" + getSortable() + "\"");
			writer.println();
			writer.print("! " + getTableHeader());
			writer.println();
			int rank = 0;
			int prevCount = 0;
			int sameRank = 0;
			for (Page page : pages) {
				if (page == null) {
					// The comparator sorts unused slots to the end, so the first
					// null means no real pages remain. Without this check a dump
					// with fewer qualifying pages than slots threw
					// NullPointerException.
					break;
				}
				if (rank == 0) {
					rank++;
					sameRank = 1;
				} else if (page.getEdits() < prevCount) {
					// Skip the ranks used up by the preceding group of ties.
					rank += sameRank;
					sameRank = 1;
				} else {
					sameRank++;
				}
				prevCount = page.getEdits();
				if (rank > limit) {
					break;
				}
				writer.print("|-");
				writer.println();
				writer.print("| " + Integer.toString(rank));
				writer.print(" || ");
				writer.print("[[:" + page.getTitle() + "]]");
				writer.print(" || ");
				writer.print(page.getNs());
				writer.print(" || ");
				writer.print(page.getEdits());
				writer.print(" || ");
				writer.print(page.getTotalEdits());
				writer.println();
			}
			writer.print("|}");
			writer.println();
		} finally {
			writer.flush();
		}
	}
	
	/**
	 * Warns on standard error when the dump file name does not start with this
	 * report's wiki name, then pauses for five seconds before continuing.
	 *
	 * @param file the dump file about to be parsed
	 */
	private void fileNameCheck(File file) {
		if (!file.getName().startsWith(getWikiName())) {
			System.err.println("WARNING: The specified file name '" + file.getName() + "' does not start with '" + getWikiName() + "'.");
			try {
				Thread.sleep(5000);
			} catch(InterruptedException e) {
				// Cut the pause short, but keep the interrupt visible to callers
				// instead of silently discarding it.
				Thread.currentThread().interrupt();
			}
		}
	}
	
	/** @return the wiki name that the dump file name is expected to start with */
	protected abstract String getWikiName();
	
	/**
	 * Hook called at the start of the report, before the table is written.
	 * This default writes nothing.
	 */
	protected void printHeader() {
		// no header by default
	}
	
	/** @return the cells of the table header row, without the leading "! " */
	protected abstract String getTableHeader();
	
	/** Suffix appended to the "wikitable" class name of the table. */
	protected final String SORTABLE = " sortable";
	
	/** @return the suffix for the table's class attribute; {@link #SORTABLE} by default */
	protected String getSortable() {
		return this.SORTABLE;
	}

	private static class DumpHandler extends DefaultHandler {
		
		/** Pattern for dump timestamps once "UTC" has been appended to their text. */
		private static final DateFormat TIMESTAMP_DUMP_FORMAT
								= new SimpleDateFormat(DATE_FORMAT_STRING + "'T'" + TIME_FORMAT_STRING + "'Z'z");
		
		/** Prefix-to-key table filled from the dump's namespace elements. */
		private final Namespaces namespaces = new Namespaces();
		
		/** Names of the currently open elements, innermost on top. */
		private final Stack<String> elementStack = new Stack<String>();
		
		/** Period boundaries, computed in startDocument(). */
		private Date beginTimestamp;
		private Date endTimestamp;
		
		/** Pages with fewer period edits than this are not collected. */
		private int minimumEdits;
		
		/** Configured limit; determines the size of the page array. */
		private int limit;
		
		/** Number of revisions counted in the month held by lastMonth. */
		private int editsInLastMonth;
		
		/** Month used for the edits-per-day figure; set in startDocument(). */
		private Calendar lastMonth = Calendar.getInstance();
		
		public Date getBeginTimestamp() {
			return this.beginTimestamp;
		}

		public Date getEndTimestamp() {
			return this.endTimestamp;
		}

		private void setLimit(int limit) {
			this.limit = limit;
		}
		
		/**
		 * Reads the run configuration from system properties before parsing
		 * starts: the period boundaries (the end date is extended to 23:59:59
		 * of that day), the month used for the edits-per-day figure, the page
		 * array sized at 1.5 times the limit, and the minimum edit count.
		 *
		 * @throws SAXException if a date property is missing or malformed, or
		 *         the minimum-edits property is not an integer
		 */
		public void startDocument() throws SAXException {
			beginTimestamp = getDateProperty(BEGIN_DATE_PROPERTY_KEY);
			final Calendar endTimestampCalendar = Calendar.getInstance();
			endTimestampCalendar.setTime(getDateProperty(END_DATE_PROPERTY_KEY));
			endTimestampCalendar.add(Calendar.HOUR, 23);
			endTimestampCalendar.add(Calendar.MINUTE, 59);
			endTimestampCalendar.add(Calendar.SECOND, 59);
			endTimestamp = endTimestampCalendar.getTime();
			lastMonth.setTime(endTimestamp);
			if (endTimestampCalendar.get(Calendar.DATE) != endTimestampCalendar.getActualMaximum(Calendar.DATE)) {
				// The end date is not the last day of its month, so use the
				// previous month. add() instead of roll(): roll() never changes
				// the year, so a January end date produced December of the
				// same year rather than of the previous one.
				lastMonth.add(Calendar.MONTH, -1);
			}
			pages = new Page[(int)(limit * 1.5)];
			final String minimumEditsText = System.getProperty(MINIMUM_EDITS_PROPERTY_KEY, "15");
			try {
				minimumEdits = Integer.parseInt(minimumEditsText);
			} catch (NumberFormatException e) {
				// Report under the right property name instead of letting a raw
				// NumberFormatException escape through the parser.
				throw new SAXException("The specified system property \"" + MINIMUM_EDITS_PROPERTY_KEY
						+ "\" is not a valid integer: " + minimumEditsText, e);
			}
		}
		
		/**
		 * Logs the closing statistics to standard error: revisions processed,
		 * average edits per day for the month held in lastMonth, and the total
		 * edit count.
		 * NOTE(review): the month label is formatted from beginTimestamp while
		 * the count belongs to lastMonth — confirm the two always agree.
		 */
		public void endDocument() throws SAXException {
			System.err.println("Processed: " + revisionCounter);
			final String monthLabel = new SimpleDateFormat(YEARMONTH_FORMAT_STRING).format(beginTimestamp);
			final int daysInMonth = lastMonth.getActualMaximum(Calendar.DATE);
			final int editsPerDay = editsInLastMonth / daysInMonth;
			System.err.println("As of the last month (" + monthLabel + "), the Wikipedia received "
					+ editsPerDay + " edits a day.");
			System.err.println("The " + totalEdits + " total edits made to the Wikipedia.");
		}
		
		/**
		 * Reads a system property and parses it with the shared date format.
		 *
		 * @param key the system property name
		 * @return the parsed date
		 * @throws SAXException if the property is not set or is not a date in
		 *         the expected format
		 */
		private static Date getDateProperty(String key) throws SAXException {
			final String property = System.getProperty(key);
			if (property == null) {
				// Previously parse(null) failed with a bare NullPointerException
				// that did not say which option was missing.
				throw new SAXException("The required system property \"" + key
						+ "\" is not set (expected format: " + DATE_FORMAT_STRING + ").");
			}
			try {
				return DATE_FORMAT.parse(property);
			} catch (ParseException e) {
				throw new SAXException("The specified system property \"" + key
						+ "\" is not a valid date (expected format: " + DATE_FORMAT_STRING + "): " + property, e);
			}
		}
		
		/**
		 * Pushes the element name onto the element stack; for a "namespace"
		 * element also reads its numeric "key" attribute into {@code ns}.
		 *
		 * @throws SAXException if the "key" attribute is not an integer
		 */
		public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
			// Use the qualified name when the parser supplies no local name.
			final String name = (localName.length() == 0) ? qName : localName;
			elementStack.push(name);
			if (!name.equals("namespace")) {
				return;
			}
			final String keyText = atts.getValue("key");
			try {
				ns = Integer.parseInt(keyText);
			} catch (NumberFormatException e) {
				throw new SAXException("ns: " + keyText, e);
			}
		}
		
		/** Revisions processed so far; drives the progress log. */
		private int revisionCounter;
		
		/** Revisions dated at or before the end of the period. */
		private int totalEdits;
		
		/** Revisions dated inside the period; only ever incremented in this class. */
		private int totalEditsInPeriod;
		
		/** Key and accumulated text of the namespace element being read. */
		private int ns;
		private String namespace = "";
		
		/** Accumulated title text, and the page created from it. */
		private String pageTitle = "";
		private Page page;
		
		/** Collected pages; allocated in startDocument(). */
		private Page[] pages;
		
		/** Accumulated timestamp text and its parsed value for the current revision. */
		private String timestampString = "";
		private Date timestamp;
		
		/** Set when the current revision's timestamp could not be parsed. */
		private boolean ignoreRevision;
		
		/** Timestamp parse failures; only ever incremented in this class. */
		private int timestampParseExceptionCount;
		
		/** Number of pages stored into the page array, replacements included. */
		int pageCounter;
		
		/** @return the page array, which may contain {@code null} slots */
		public Page[] getPages() {
			return this.pages;
		}
		
		/** Reused for every revision instead of allocating a Calendar each time. */
		private final Calendar revisionCalendar = Calendar.getInstance();
		
		/**
		 * Pops the element that just closed and acts on it: registers a
		 * namespace, creates the page for a title, parses a timestamp, counts
		 * a revision, or stores a finished page.
		 */
		public void endElement(String uri, String localName, String qName) throws SAXException {
			final String name = elementStack.pop();
			if (name.equals("namespace")) {
				namespaces.add(namespace, ns);
				ns = 0;
				namespace = "";
			} else if (name.equals("page")) {
				finishPage();
			} else if (name.equals("title")) {
				page = new Page(pageTitle, namespaces.ns(pageTitle));
				pageTitle = "";
			} else if (name.equals("timestamp")) {
				finishTimestamp();
			} else if (name.equals("revision")) {
				finishRevision();
			}
		}
		
		/** Stores the finished page in the page array if it qualifies. */
		private void finishPage() {
			if (page == null || page.getEdits() < minimumEdits) {
				return;
			}
			if (pages.length == 0) {
				// A limit of 0 gives an empty array; indexing its last slot
				// would throw ArrayIndexOutOfBoundsException.
				return;
			}
			boolean stored = false;
			if (pageCounter < pages.length) {
				// Free slots remain; sorting keeps nulls last, so index
				// pageCounter is always the first free one.
				pages[pageCounter] = page;
				pageCounter++;
				stored = true;
			} else if (page.getEdits() > pages[pages.length - 1].getEdits()) {
				// Array full: replace the lowest-ranked page, which is last.
				pages[pages.length - 1] = page;
				pageCounter++;
				stored = true;
			}
			// Re-sort only when the array changed; sorting on every page end
			// repeated work on an already sorted array.
			if (stored && pageCounter >= limit) {
				Arrays.sort(pages, PAGES_COMPARATOR);
			}
		}
		
		/** Parses the accumulated timestamp text; flags the revision on failure. */
		private void finishTimestamp() {
			ignoreRevision = false;
			try {
				timestamp = TIMESTAMP_DUMP_FORMAT.parse(timestampString + "UTC");
			} catch (ParseException e) {
				timestampParseExceptionCount++;
				ignoreRevision = true;
			} finally {
				// Always clear the buffer. Clearing only on success left the bad
				// text in place, so every later timestamp was appended to it and
				// failed to parse as well.
				timestampString = "";
			}
		}
		
		/** Adds the finished revision to the page and overall counters. */
		private void finishRevision() {
			if (ignoreRevision || timestamp == null || page == null) {
				// No usable timestamp or no page to count against: skip the
				// revision instead of failing with NullPointerException.
				return;
			}
			if (timestampBeroreOrEquals(timestamp)) {
				page.incrementTotalEdits();
				totalEdits ++;
				if (timestampIsInPeriod(timestamp)) {
					page.incrementEdits();
					totalEditsInPeriod ++;
				}
			}
			revisionCalendar.setTime(timestamp);
			if (revisionCalendar.get(Calendar.YEAR) == lastMonth.get(Calendar.YEAR)
					&& revisionCalendar.get(Calendar.MONTH) == lastMonth.get(Calendar.MONTH)) {
				editsInLastMonth ++;
			}
			timestamp = null;
			revisionCounter++;
			final int LOG_INTERVAL = 10000;
			if (revisionCounter % LOG_INTERVAL == 0) {
				System.err.println("Processed: " + revisionCounter);
			}
		}
		
		/** @return whether the timestamp lies between beginTimestamp and endTimestamp, both inclusive */
		private boolean timestampIsInPeriod(Date timestamp) {
			final boolean notBeforeBegin = !timestamp.before(beginTimestamp);
			return notBeforeBegin && timestampBeroreOrEquals(timestamp);
		}
		
		/** @return whether the timestamp is at or before endTimestamp */
		private boolean timestampBeroreOrEquals(Date timestamp) {
			return !timestamp.after(endTimestamp);
		}
		
		/**
		 * Appends character data to the buffer of the innermost open element
		 * when that element is a namespace, title or timestamp; other text is
		 * dropped. Text arriving outside any element, or with an invalid
		 * range, is silently ignored.
		 */
		public void characters (char[] ch, int start, int length) {
			try {
				final String current = elementStack.peek();
				final String text = String.valueOf(ch, start, length);
				// The three names are mutually exclusive, so at most one branch runs.
				if (current.equals("namespace")) {
					namespace = namespace + text;
				} else if (current.equals("title")) {
					pageTitle = pageTitle + text;
				} else if (current.equals("timestamp")) {
					timestampString = timestampString + text;
				}
			} catch (EmptyStackException e) {
				// no open element: nothing to collect
			} catch (IndexOutOfBoundsException e) {
				// invalid character range: nothing to collect
			}
		}

	}

}

PagesByNumberOfRecentEdits_de.java

public class PagesByNumberOfRecentEdits_de extends PagesByNumberOfRecentEdits {

	/**
	 * The main() method for this application.
	 * @param args command-line arguments
	 */
	public static void main(String[] args) {
		new PagesByNumberOfRecentEdits_de().execute(args);
	}
	
	protected String getWikiName() {
		return "dewiki";
	}
	
	protected void printHeader() {
		getWriter().print("Frist: "
					+ DATE_FORMAT.format(getBeginTimestamp())
					+ " &mdash; "
					+ DATE_FORMAT.format(getEndTimestamp())
					+ " (UTC)");
		getWriter().println();
		getWriter().println();
	}
	
	protected String getTableHeader() {
		return "# !! Seite !! [[Hilfe:Namensräume|Namensräume]] !! Bearb. (30 T.) !! Bearb.";
	}

}

PagesByNumberOfRecentEdits_en.java

public class PagesByNumberOfRecentEdits_en extends PagesByNumberOfRecentEdits {

	/**
	 * The main() method for this application.
	 * @param args command-line arguments
	 */
	public static void main(String[] args) {
		new PagesByNumberOfRecentEdits_en().execute(args);
	}
	
	protected String getWikiName() {
		return "enwiki";
	}
	
	protected void printHeader() {
		getWriter().print("Period: "
					+ DATE_FORMAT.format(getBeginTimestamp())
					+ " &mdash; "
					+ DATE_FORMAT.format(getEndTimestamp())
					+ " (UTC)");
		getWriter().println();
		getWriter().println();
	}
	
	protected String getTableHeader() {
		return "Rank !! Page !! [[Wikipedia:Namespace|Namespace]] !! Recent Edits !! Total Edits";
	}

}

PagesByNumberOfRecentEdits_ja.java

public class PagesByNumberOfRecentEdits_ja extends PagesByNumberOfRecentEdits {

	/**
	 * The main() method for this application.
	 * @param args command-line arguments
	 */
	public static void main(String[] args) {
		new PagesByNumberOfRecentEdits_ja().execute(args);
	}
	
	protected String getWikiName() {
		return "jawiki";
	}
	
	protected void printHeader() {
		getWriter().print("期間: "
					+ DATE_FORMAT.format(getBeginTimestamp())
					+ " &mdash; "
					+ DATE_FORMAT.format(getEndTimestamp())
					+ " (UTC)");
		getWriter().println();
		getWriter().println();
	}
	
	protected String getTableHeader() {
		return "順位 !! ページ !! [[Help:名前空間|名前空間]] !! 編集回数 !! 総編集回数";
	}

}

PagesByNumberOfRecentEdits_zh.java

public class PagesByNumberOfRecentEdits_zh extends PagesByNumberOfRecentEdits {

	/**
	 * The main() method for this application.
	 * @param args command-line arguments
	 */
	public static void main(String[] args) {
		new PagesByNumberOfRecentEdits_zh().execute(args);
	}
	
	protected String getWikiName() {
		return "zhwiki";
	}
	
	protected void printHeader() {
		getWriter().print("期间: "
					+ DATE_FORMAT.format(getBeginTimestamp())
					+ " &mdash; "
					+ DATE_FORMAT.format(getEndTimestamp())
					+ " (UTC)");
		getWriter().println();
		getWriter().println();
	}
	
	protected String getTableHeader() {
		return "名次 !! 页面 !! [[Help:名字空间|名字空间]] !! 最近编辑次数 !! 累积编辑次数";
	}

}

Instructions

[edit]
java -Dbegin.date=2008-04-01 -Dend.date=2008-04-30 -Dlimit=5000 -Dmin.edits=80 PagesByNumberOfRecentEdits_en enwiki-20080501-stub-meta-history.xml.gz > result.txt
java -Dbegin.date=2008-04-01 -Dend.date=2008-04-30 -Dlimit=1000 -Dmin.edits=20 PagesByNumberOfRecentEdits_ja jawiki-20080501-stub-meta-history.xml.gz > result.txt
java -Dbegin.date=2008-04-01 -Dend.date=2008-04-30 -Dlimit=1000 -Dmin.edits=40 PagesByNumberOfRecentEdits_de dewiki-20080501-stub-meta-history.xml.gz > result.txt
java -Dbegin.date=2008-04-01 -Dend.date=2008-04-30 -Dlimit=1000 PagesByNumberOfRecentEdits_zh zhwiki-20080501-stub-meta-history.xml.gz > result.txt