/*
 * Decompiled with CFR 0.152.
 */
package visualizer.preprocessing;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Vector;
import java.util.regex.Pattern;
import visualizer.graph.Vertex;
import visualizer.preprocessing.Ngram;
import visualizer.preprocessing.PorterStemmer;
import visualizer.preprocessing.stopwords.StopwordsList;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * Turns the textual content of a collection of vertices into a
 * document-by-n-gram matrix.
 *
 * <p>Each vertex is treated as one document. Its content is tokenized,
 * stop words are dropped, tokens are optionally stemmed, and consecutive
 * tokens are joined into n-grams. N-grams whose document-frequency ratio
 * lies inside a caller-supplied range become the matrix columns.
 *
 * <p>Instances keep the stop-word list and the selected n-grams of the most
 * recent {@link #getMatrix} call in fields, so a single instance should not
 * be used for overlapping calls.
 */
public class Preprocessor {
    /** Matrix type: term frequency times inverse document frequency, min-max normalized per row. */
    public static final int TF_IDF = 0;
    /** Matrix type: raw term frequency (occurrence counts). */
    public static final int TF = 1;

    /** Appended after every word of an n-gram key, e.g. {@code "foo<>bar<>"}. */
    private static final String NGRAM_SEPARATOR = "<>";

    /**
     * Matches any single character that is not an ASCII letter or one of the
     * accented letters listed in the class; used as the token delimiter.
     * Compiled once because {@link Pattern} is immutable and reusable.
     */
    private static final Pattern TOKEN_DELIMITER = Pattern.compile("[^A-Za-z\u00e1\u00e0\u00e3\u00e2\u00e9\u00e8\u00ea\u00ed\u00ec\u00ee\u00f3\u00f2\u00f5\u00f4\u00fa\u00f9\u00fb\u00c1\u00c0\u00c3\u00c2\u00c9\u00c8\u00ca\u00cd\u00cc\u00ce\u00d3\u00d2\u00d5\u00d4\u00da\u00d9\u00db\u00e7\u00c7]");

    /** N-grams selected by the most recent {@link #getMatrix} call; {@code null} before the first call. */
    private ArrayList<Ngram> ngrams;
    /** Stop-word list supplied to the most recent {@link #getMatrix} call. */
    private StopwordsList stp;

    /**
     * Builds the document-by-n-gram matrix for the given vertices.
     *
     * <p>Row {@code i} corresponds to {@code vertex.get(i)}; column {@code j}
     * corresponds to {@code getNgrams().get(j)}. Cells hold the number of
     * times the n-gram occurs in the document. When {@code matrixType} is
     * {@link #TF_IDF} and there is more than one document, each count is
     * multiplied by {@code ln(documents / documentFrequency)} and every row
     * is then min-max normalized to [0, 1]. Any other {@code matrixType}, or
     * a single document, yields the raw counts.
     *
     * @param max         upper bound (inclusive) on the fraction of documents an n-gram may appear in
     * @param min         lower bound (inclusive) on the fraction of documents an n-gram must appear in
     * @param numberGrams number of consecutive words joined into one n-gram
     * @param matrixType  {@link #TF_IDF} or {@link #TF}
     * @param stemming    whether tokens are passed through {@link PorterStemmer} before n-grams are built
     * @param vertex      the documents; their order defines the row order
     * @param stp         stop-word list used to drop tokens
     * @return one row per vertex, one column per selected n-gram
     * @throws IOException declared for reading vertex content; propagated from the helpers
     */
    public float[][] getMatrix(float max, float min, int numberGrams, int matrixType, boolean stemming, Vector<Vertex> vertex, StopwordsList stp) throws IOException {
        this.stp = stp;
        HashMap<String, Integer> corporaNgrams = new HashMap<String, Integer>();
        HashMap<Vertex, HashMap<String, Integer>> documentsNgrams = new HashMap<Vertex, HashMap<String, Integer>>();
        this.getNgramsFrequency(corporaNgrams, documentsNgrams, numberGrams, stemming, vertex);

        this.ngrams = this.selectNgrams(corporaNgrams, documentsNgrams, vertex, max, min);
        float[][] matrix = this.buildFrequencyMatrix(documentsNgrams, vertex);

        // With a single document every idf would be ln(1) = 0, so weighting is skipped.
        if (vertex.size() > 1 && matrixType == TF_IDF) {
            this.applyTfIdf(matrix);
            this.normalize(matrix);
        }
        return matrix;
    }

    /**
     * Returns the n-grams that define the columns of the last matrix built.
     *
     * @return the list selected by the most recent {@link #getMatrix} call,
     *         or {@code null} if {@code getMatrix} has not been called yet
     */
    public ArrayList<Ngram> getNgrams() {
        return this.ngrams;
    }

    /**
     * Chooses the n-grams whose document-frequency ratio lies in {@code [min, max]}.
     * Candidates are ordered by {@link Ngram}'s natural ordering before filtering,
     * so the returned list keeps that order.
     */
    private ArrayList<Ngram> selectNgrams(HashMap<String, Integer> corporaNgrams, HashMap<Vertex, HashMap<String, Integer>> documentsNgrams, Vector<Vertex> vertex, float max, float min) {
        ArrayList<Ngram> candidates = new ArrayList<Ngram>(corporaNgrams.size());
        for (String text : corporaNgrams.keySet()) {
            Ngram candidate = new Ngram(text);
            candidate.frequency = corporaNgrams.get(text);
            candidates.add(candidate);
        }
        Collections.sort(candidates);

        // Count, for every n-gram, how many documents contain it. Walking each
        // document's own n-grams avoids probing every document with the whole vocabulary.
        HashMap<String, Integer> documentCounts = new HashMap<String, Integer>();
        for (int doc = 0; doc < vertex.size(); ++doc) {
            for (String text : documentsNgrams.get(vertex.get(doc)).keySet()) {
                increment(documentCounts, text);
            }
        }

        ArrayList<Ngram> selected = new ArrayList<Ngram>();
        float documents = (float)vertex.size();
        for (Ngram candidate : candidates) {
            Integer containing = documentCounts.get(candidate.ngram);
            float ratio = (containing == null ? 0.0f : containing.floatValue()) / documents;
            if (ratio >= min && ratio <= max) {
                selected.add(candidate);
            }
        }
        return selected;
    }

    /**
     * Builds the raw count matrix: one row per vertex, one column per entry of
     * {@code this.ngrams}, zero where the document lacks the n-gram.
     */
    private float[][] buildFrequencyMatrix(HashMap<Vertex, HashMap<String, Integer>> documentsNgrams, Vector<Vertex> vertex) {
        float[][] matrix = new float[vertex.size()][];
        for (int row = 0; row < vertex.size(); ++row) {
            HashMap<String, Integer> docNgrams = documentsNgrams.get(vertex.get(row));
            float[] counts = new float[this.ngrams.size()];
            int col = 0;
            for (Ngram n : this.ngrams) {
                Integer occurrences = docNgrams.get(n.ngram);
                counts[col++] = occurrences == null ? 0.0f : (float)occurrences.intValue();
            }
            matrix[row] = counts;
        }
        return matrix;
    }

    /**
     * Replaces every count in place by {@code tf * ln(rows / documentFrequency)},
     * where documentFrequency is the number of rows with a non-zero count in
     * that column. Requires at least one row.
     */
    private void applyTfIdf(float[][] matrix) {
        float[] documentFrequency = new float[matrix[0].length];
        for (int lin = 0; lin < matrix.length; ++lin) {
            for (int col = 0; col < matrix[lin].length; ++col) {
                if (matrix[lin][col] != 0.0f) {
                    documentFrequency[col] += 1.0f;
                }
            }
        }
        for (int lin = 0; lin < matrix.length; ++lin) {
            for (int col = 0; col < matrix[lin].length; ++col) {
                if (documentFrequency[col] == 0.0f) {
                    // Column is all zeros; dividing would give 0 * Infinity = NaN.
                    matrix[lin][col] = 0.0f;
                    continue;
                }
                float tf = matrix[lin][col];
                float idf = (float)Math.log((float)matrix.length / documentFrequency[col]);
                matrix[lin][col] = tf * idf;
            }
        }
    }

    /**
     * Extracts the n-grams of every vertex.
     *
     * @param corporaNgrams   receives the total occurrence count of each n-gram over all vertices
     * @param documentsNgrams receives, per vertex, the occurrence count of each n-gram in that vertex
     */
    private void getNgramsFrequency(HashMap<String, Integer> corporaNgrams, HashMap<Vertex, HashMap<String, Integer>> documentsNgrams, int numberGrams, boolean stemming, Vector<Vertex> vertex) throws IOException {
        for (Vertex v : vertex) {
            Vector<String> words = this.getWordsFromFile(v, stemming);
            documentsNgrams.put(v, this.getNgrams(words, corporaNgrams, numberGrams));
        }
    }

    /**
     * Counts the n-grams of one document and adds the same occurrences to the
     * corpus-wide counts. An n-gram key is each of its words followed by
     * {@code "<>"}.
     *
     * @return the per-document n-gram counts
     */
    private HashMap<String, Integer> getNgrams(Vector<String> words, HashMap<String, Integer> corporaNgrams, int numberGrams) {
        HashMap<String, Integer> documentNgrams = new HashMap<String, Integer>();
        int lastStart = words.size() - numberGrams;
        for (int start = 0; start <= lastStart; ++start) {
            StringBuilder key = new StringBuilder();
            for (int j = start; j < start + numberGrams; ++j) {
                key.append(words.elementAt(j)).append(NGRAM_SEPARATOR);
            }
            String ngram = key.toString();
            increment(documentNgrams, ngram);
            increment(corporaNgrams, ngram);
        }
        return documentNgrams;
    }

    /** Adds one to the count stored under {@code key}, starting from zero when absent. */
    private static void increment(HashMap<String, Integer> counts, String key) {
        Integer current = counts.get(key);
        counts.put(key, current == null ? 1 : current + 1);
    }

    /**
     * Tokenizes the content of a vertex: splits on non-letter characters,
     * lowercases, drops stop words and optionally stems.
     *
     * @return the remaining tokens in document order; empty when the content is {@code null}
     */
    private Vector<String> getWordsFromFile(Vertex vertex, boolean stemming) throws IOException {
        Vector<String> resultingWords = new Vector<String>();
        PorterStemmer stemmer = new PorterStemmer();
        String filecontent = vertex.getWebElement().getContent();
        if (filecontent == null) {
            return resultingWords;
        }
        for (String piece : TOKEN_DELIMITER.split(filecontent)) {
            // Adjacent delimiters produce empty pieces; pieces never contain whitespace.
            if (piece.length() == 0) {
                continue;
            }
            // Locale.ROOT keeps lowercasing stable regardless of the JVM default
            // locale (e.g. Turkish would map "I" to a dotless i).
            String token = piece.toLowerCase(Locale.ROOT);
            if (this.stp.isStopWord(token)) {
                continue;
            }
            resultingWords.add(stemming ? stemmer.stem(token) : token);
        }
        return resultingWords;
    }

    /**
     * Min-max normalizes every row in place to [0, 1]. A row whose values are
     * all equal becomes all zeros; an empty row is left untouched.
     */
    private void normalize(float[][] points) {
        for (float[] row : points) {
            if (row.length == 0) {
                // No columns were selected; there is nothing to scale.
                continue;
            }
            float max = row[0];
            float min = row[0];
            for (int col = 1; col < row.length; ++col) {
                if (row[col] > max) {
                    max = row[col];
                } else if (row[col] < min) {
                    min = row[col];
                }
            }
            float range = max - min;
            for (int col = 0; col < row.length; ++col) {
                row[col] = range > 0.0f ? (row[col] - min) / range : 0.0f;
            }
        }
    }
}

